diff options
328 files changed, 9764 insertions, 3465 deletions
diff --git a/Documentation/DocBook/device-drivers.tmpl b/Documentation/DocBook/device-drivers.tmpl index 35447e08173..36f63d4a0a0 100644 --- a/Documentation/DocBook/device-drivers.tmpl +++ b/Documentation/DocBook/device-drivers.tmpl @@ -217,8 +217,8 @@ X!Isound/sound_firmware.c <chapter id="uart16x50"> <title>16x50 UART Driver</title> !Iinclude/linux/serial_core.h -!Edrivers/serial/serial_core.c -!Edrivers/serial/8250.c +!Edrivers/tty/serial/serial_core.c +!Edrivers/tty/serial/8250.c </chapter> <chapter id="fbdev"> diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index b959659c5df..ccb6048415b 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -603,3 +603,10 @@ Why: The adm9240, w83792d and w83793 hardware monitoring drivers have Who: Jean Delvare <khali@linux-fr.org> ---------------------------- + +What: xt_connlimit rev 0 +When: 2012 +Who: Jan Engelhardt <jengelh@medozas.de> +Files: net/netfilter/xt_connlimit.c + +---------------------------- diff --git a/MAINTAINERS b/MAINTAINERS index 55592f8b672..cf0f3a5c09c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3674,6 +3674,28 @@ F: include/linux/key-type.h F: include/keys/ F: security/keys/ +KEYS-TRUSTED +M: David Safford <safford@watson.ibm.com> +M: Mimi Zohar <zohar@us.ibm.com> +L: linux-security-module@vger.kernel.org +L: keyrings@linux-nfs.org +S: Supported +F: Documentation/keys-trusted-encrypted.txt +F: include/keys/trusted-type.h +F: security/keys/trusted.c +F: security/keys/trusted.h + +KEYS-ENCRYPTED +M: Mimi Zohar <zohar@us.ibm.com> +M: David Safford <safford@watson.ibm.com> +L: linux-security-module@vger.kernel.org +L: keyrings@linux-nfs.org +S: Supported +F: Documentation/keys-trusted-encrypted.txt +F: include/keys/encrypted-type.h +F: security/keys/encrypted.c +F: security/keys/encrypted.h + KGDB / KDB /debug_core M: Jason Wessel <jason.wessel@windriver.com> W: http://kgdb.wiki.kernel.org/ diff --git a/arch/arm/mach-omap1/Kconfig b/arch/arm/mach-omap1/Kconfig index 8d2f2daba0c..e0a028161dd 100644 --- a/arch/arm/mach-omap1/Kconfig +++ b/arch/arm/mach-omap1/Kconfig @@ -9,6 +9,7 @@ config ARCH_OMAP730 depends on ARCH_OMAP1 bool "OMAP730 Based System" select CPU_ARM926T + select OMAP_MPU_TIMER select ARCH_OMAP_OTG config ARCH_OMAP850 @@ -22,6 +23,7 @@ config ARCH_OMAP15XX default y bool "OMAP15xx Based System" select CPU_ARM925T + select OMAP_MPU_TIMER config ARCH_OMAP16XX depends on ARCH_OMAP1 diff --git a/arch/arm/mach-omap1/Makefile b/arch/arm/mach-omap1/Makefile index 6ee19504845..ba6009f2767 100644 --- a/arch/arm/mach-omap1/Makefile +++ b/arch/arm/mach-omap1/Makefile @@ -3,12 +3,11 @@ # # Common support -obj-y := io.o id.o sram.o irq.o mux.o flash.o serial.o devices.o dma.o +obj-y := io.o id.o sram.o time.o irq.o mux.o flash.o serial.o devices.o dma.o obj-y += clock.o clock_data.o opp_data.o obj-$(CONFIG_OMAP_MCBSP) += mcbsp.o -obj-$(CONFIG_OMAP_MPU_TIMER) += time.o obj-$(CONFIG_OMAP_32K_TIMER) += timer32k.o # Power Management diff --git a/arch/arm/mach-omap1/time.c b/arch/arm/mach-omap1/time.c index ed7a61ff916..f83fc335c61 100644 --- a/arch/arm/mach-omap1/time.c +++ b/arch/arm/mach-omap1/time.c @@ -44,16 +44,21 @@ #include <linux/clocksource.h> #include <linux/clockchips.h> #include <linux/io.h> +#include <linux/sched.h> #include <asm/system.h> #include <mach/hardware.h> #include <asm/leds.h> #include <asm/irq.h> +#include <asm/sched_clock.h> + #include <asm/mach/irq.h> #include <asm/mach/time.h> #include <plat/common.h> +#ifdef CONFIG_OMAP_MPU_TIMER + #define OMAP_MPU_TIMER_BASE OMAP_MPU_TIMER1_BASE #define OMAP_MPU_TIMER_OFFSET 0x100 @@ -67,7 +72,7 @@ typedef struct { ((volatile omap_mpu_timer_regs_t*)OMAP1_IO_ADDRESS(OMAP_MPU_TIMER_BASE + \ (n)*OMAP_MPU_TIMER_OFFSET)) -static inline unsigned long omap_mpu_timer_read(int nr) +static inline unsigned long notrace omap_mpu_timer_read(int nr) { volatile omap_mpu_timer_regs_t* timer = omap_mpu_timer_base(nr); return timer->read_tim; @@ -212,6 +217,32 @@ static struct clocksource clocksource_mpu = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; +static DEFINE_CLOCK_DATA(cd); + +static inline unsigned long long notrace _omap_mpu_sched_clock(void) +{ + u32 cyc = mpu_read(&clocksource_mpu); + return cyc_to_sched_clock(&cd, cyc, (u32)~0); +} + +#ifndef CONFIG_OMAP_32K_TIMER +unsigned long long notrace sched_clock(void) +{ + return _omap_mpu_sched_clock(); +} +#else +static unsigned long long notrace omap_mpu_sched_clock(void) +{ + return _omap_mpu_sched_clock(); +} +#endif + +static void notrace mpu_update_sched_clock(void) +{ + u32 cyc = mpu_read(&clocksource_mpu); + update_sched_clock(&cd, cyc, (u32)~0); +} + static void __init omap_init_clocksource(unsigned long rate) { static char err[] __initdata = KERN_ERR @@ -219,17 +250,13 @@ static void __init omap_init_clocksource(unsigned long rate) setup_irq(INT_TIMER2, &omap_mpu_timer2_irq); omap_mpu_timer_start(1, ~0, 1); + init_sched_clock(&cd, mpu_update_sched_clock, 32, rate); if (clocksource_register_hz(&clocksource_mpu, rate)) printk(err, clocksource_mpu.name); } -/* - * --------------------------------------------------------------------------- - * Timer initialization - * --------------------------------------------------------------------------- - */ -static void __init omap_timer_init(void) +static void __init omap_mpu_timer_init(void) { struct clk *ck_ref = clk_get(NULL, "ck_ref"); unsigned long rate; @@ -246,6 +273,66 @@ static void __init omap_timer_init(void) omap_init_clocksource(rate); } +#else +static inline void omap_mpu_timer_init(void) +{ + pr_err("Bogus timer, should not happen\n"); +} +#endif /* CONFIG_OMAP_MPU_TIMER */ + +#if defined(CONFIG_OMAP_MPU_TIMER) && defined(CONFIG_OMAP_32K_TIMER) +static unsigned long long (*preferred_sched_clock)(void); + +unsigned long long notrace sched_clock(void) +{ + if (!preferred_sched_clock) + return 0; + + return preferred_sched_clock(); +} + +static inline void preferred_sched_clock_init(bool use_32k_sched_clock) +{ + if (use_32k_sched_clock) + preferred_sched_clock = omap_32k_sched_clock; + else + preferred_sched_clock = omap_mpu_sched_clock; +} +#else +static inline void preferred_sched_clock_init(bool use_32k_sched_clcok) +{ +} +#endif + +static inline int omap_32k_timer_usable(void) +{ + int res = false; + + if (cpu_is_omap730() || cpu_is_omap15xx()) + return res; + +#ifdef CONFIG_OMAP_32K_TIMER + res = omap_32k_timer_init(); +#endif + + return res; +} + +/* + * --------------------------------------------------------------------------- + * Timer initialization + * --------------------------------------------------------------------------- + */ +static void __init omap_timer_init(void) +{ + if (omap_32k_timer_usable()) { + preferred_sched_clock_init(1); + } else { + omap_mpu_timer_init(); + preferred_sched_clock_init(0); + } +} + struct sys_timer omap_timer = { .init = omap_timer_init, }; diff --git a/arch/arm/mach-omap1/timer32k.c b/arch/arm/mach-omap1/timer32k.c index 20cfbcc6c60..13d7b8f145b 100644 --- a/arch/arm/mach-omap1/timer32k.c +++ b/arch/arm/mach-omap1/timer32k.c @@ -52,10 +52,9 @@ #include <asm/irq.h> #include <asm/mach/irq.h> #include <asm/mach/time.h> +#include <plat/common.h> #include <plat/dmtimer.h> -struct sys_timer omap_timer; - /* * --------------------------------------------------------------------------- * 32KHz OS timer @@ -181,14 +180,14 @@ static __init void omap_init_32k_timer(void) * Timer initialization * --------------------------------------------------------------------------- */ -static void __init omap_timer_init(void) +bool __init omap_32k_timer_init(void) { + omap_init_clocksource_32k(); + #ifdef CONFIG_OMAP_DM_TIMER omap_dm_timer_init(); #endif omap_init_32k_timer(); -} -struct sys_timer omap_timer = { - .init = omap_timer_init, -}; + return true; +} diff --git a/arch/arm/mach-omap2/board-cm-t3517.c b/arch/arm/mach-omap2/board-cm-t3517.c index 5b0c77732df..8f9a64d650e 100644 --- a/arch/arm/mach-omap2/board-cm-t3517.c +++ b/arch/arm/mach-omap2/board-cm-t3517.c @@ -124,8 +124,9 @@ static inline void cm_t3517_init_hecc(void) {} #if defined(CONFIG_RTC_DRV_V3020) || defined(CONFIG_RTC_DRV_V3020_MODULE) #define RTC_IO_GPIO (153) #define RTC_WR_GPIO (154) -#define RTC_RD_GPIO (160) +#define RTC_RD_GPIO (53) #define RTC_CS_GPIO (163) +#define RTC_CS_EN_GPIO (160) struct v3020_platform_data cm_t3517_v3020_pdata = { .use_gpio = 1, @@ -145,6 +146,16 @@ static struct platform_device cm_t3517_rtc_device = { static void __init cm_t3517_init_rtc(void) { + int err; + + err = gpio_request(RTC_CS_EN_GPIO, "rtc cs en"); + if (err) { + pr_err("CM-T3517: rtc cs en gpio request failed: %d\n", err); + return; + } + + gpio_direction_output(RTC_CS_EN_GPIO, 1); + platform_device_register(&cm_t3517_rtc_device); } #else @@ -214,12 +225,12 @@ static struct mtd_partition cm_t3517_nand_partitions[] = { }, { .name = "linux", - .offset = MTDPART_OFS_APPEND, /* Offset = 0x280000 */ + .offset = MTDPART_OFS_APPEND, /* Offset = 0x2A0000 */ .size = 32 * NAND_BLOCK_SIZE, }, { .name = "rootfs", - .offset = MTDPART_OFS_APPEND, /* Offset = 0x680000 */ + .offset = MTDPART_OFS_APPEND, /* Offset = 0x6A0000 */ .size = MTDPART_SIZ_FULL, }, }; @@ -256,11 +267,19 @@ static void __init cm_t3517_init_irq(void) static struct omap_board_mux board_mux[] __initdata = { /* GPIO186 - Green LED */ OMAP3_MUX(SYS_CLKOUT2, OMAP_MUX_MODE4 | OMAP_PIN_OUTPUT), - /* RTC GPIOs: IO, WR#, RD#, CS# */ + + /* RTC GPIOs: */ + /* IO - GPIO153 */ OMAP3_MUX(MCBSP4_DR, OMAP_MUX_MODE4 | OMAP_PIN_INPUT), + /* WR# - GPIO154 */ OMAP3_MUX(MCBSP4_DX, OMAP_MUX_MODE4 | OMAP_PIN_INPUT), - OMAP3_MUX(MCBSP_CLKS, OMAP_MUX_MODE4 | OMAP_PIN_INPUT), + /* RD# - GPIO53 */ + OMAP3_MUX(GPMC_NCS2, OMAP_MUX_MODE4 | OMAP_PIN_INPUT), + /* CS# - GPIO163 */ OMAP3_MUX(UART3_CTS_RCTX, OMAP_MUX_MODE4 | OMAP_PIN_INPUT), + /* CS EN - GPIO160 */ + OMAP3_MUX(MCBSP_CLKS, OMAP_MUX_MODE4 | OMAP_PIN_INPUT), + /* HSUSB1 RESET */ OMAP3_MUX(UART2_TX, OMAP_MUX_MODE4 | OMAP_PIN_OUTPUT), /* HSUSB2 RESET */ diff --git a/arch/arm/mach-omap2/board-devkit8000.c b/arch/arm/mach-omap2/board-devkit8000.c index 00bb1fc5e01..e906e05bb41 100644 --- a/arch/arm/mach-omap2/board-devkit8000.c +++ b/arch/arm/mach-omap2/board-devkit8000.c @@ -275,8 +275,7 @@ static struct twl4030_gpio_platform_data devkit8000_gpio_data = { .irq_base = TWL4030_GPIO_IRQ_BASE, .irq_end = TWL4030_GPIO_IRQ_END, .use_leds = true, - .pullups = BIT(1), - .pulldowns = BIT(2) | BIT(6) | BIT(7) | BIT(8) | BIT(13) + .pulldowns = BIT(1) | BIT(2) | BIT(6) | BIT(8) | BIT(13) | BIT(15) | BIT(16) | BIT(17), .setup = devkit8000_twl_gpio_setup, }; diff --git a/arch/arm/mach-omap2/clock44xx_data.c b/arch/arm/mach-omap2/clock44xx_data.c index e8cb32fd7f1..de9ec8ddd2a 100644 --- a/arch/arm/mach-omap2/clock44xx_data.c +++ b/arch/arm/mach-omap2/clock44xx_data.c @@ -34,7 +34,6 @@ #include "cm2_44xx.h" #include "cm-regbits-44xx.h" #include "prm44xx.h" -#include "prm44xx.h" #include "prm-regbits-44xx.h" #include "control.h" #include "scrm44xx.h" diff --git a/arch/arm/mach-omap2/clockdomain.c b/arch/arm/mach-omap2/clockdomain.c index e20b98636ab..58e42f76603 100644 --- a/arch/arm/mach-omap2/clockdomain.c +++ b/arch/arm/mach-omap2/clockdomain.c @@ -423,6 +423,12 @@ int clkdm_add_wkdep(struct clockdomain *clkdm1, struct clockdomain *clkdm2) { struct clkdm_dep *cd; + if (!cpu_is_omap24xx() && !cpu_is_omap34xx()) { + pr_err("clockdomain: %s/%s: %s: not yet implemented\n", + clkdm1->name, clkdm2->name, __func__); + return -EINVAL; + } + if (!clkdm1 || !clkdm2) return -EINVAL; @@ -458,6 +464,12 @@ int clkdm_del_wkdep(struct clockdomain *clkdm1, struct clockdomain *clkdm2) { struct clkdm_dep *cd; + if (!cpu_is_omap24xx() && !cpu_is_omap34xx()) { + pr_err("clockdomain: %s/%s: %s: not yet implemented\n", + clkdm1->name, clkdm2->name, __func__); + return -EINVAL; + } + if (!clkdm1 || !clkdm2) return -EINVAL; @@ -500,6 +512,12 @@ int clkdm_read_wkdep(struct clockdomain *clkdm1, struct clockdomain *clkdm2) if (!clkdm1 || !clkdm2) return -EINVAL; + if (!cpu_is_omap24xx() && !cpu_is_omap34xx()) { + pr_err("clockdomain: %s/%s: %s: not yet implemented\n", + clkdm1->name, clkdm2->name, __func__); + return -EINVAL; + } + cd = _clkdm_deps_lookup(clkdm2, clkdm1->wkdep_srcs); if (IS_ERR(cd)) { pr_debug("clockdomain: hardware cannot set/clear wake up of " @@ -527,6 +545,12 @@ int clkdm_clear_all_wkdeps(struct clockdomain *clkdm) struct clkdm_dep *cd; u32 mask = 0; + if (!cpu_is_omap24xx() && !cpu_is_omap34xx()) { + pr_err("clockdomain: %s: %s: not yet implemented\n", + clkdm->name, __func__); + return -EINVAL; + } + if (!clkdm) return -EINVAL; @@ -830,8 +854,7 @@ void omap2_clkdm_allow_idle(struct clockdomain *clkdm) * dependency code and data for OMAP4. */ if (cpu_is_omap44xx()) { - WARN_ONCE(1, "clockdomain: OMAP4 wakeup/sleep dependency " - "support is not yet implemented\n"); + pr_err("clockdomain: %s: OMAP4 wakeup/sleep dependency support: not yet implemented\n", clkdm->name); } else { if (atomic_read(&clkdm->usecount) > 0) _clkdm_add_autodeps(clkdm); @@ -872,8 +895,7 @@ void omap2_clkdm_deny_idle(struct clockdomain *clkdm) * dependency code and data for OMAP4. */ if (cpu_is_omap44xx()) { - WARN_ONCE(1, "clockdomain: OMAP4 wakeup/sleep dependency " - "support is not yet implemented\n"); + pr_err("clockdomain: %s: OMAP4 wakeup/sleep dependency support: not yet implemented\n", clkdm->name); } else { if (atomic_read(&clkdm->usecount) > 0) _clkdm_del_autodeps(clkdm); diff --git a/arch/arm/mach-omap2/clockdomains44xx_data.c b/arch/arm/mach-omap2/clockdomains44xx_data.c index 51920fc7fc5..10622c914ab 100644 --- a/arch/arm/mach-omap2/clockdomains44xx_data.c +++ b/arch/arm/mach-omap2/clockdomains44xx_data.c @@ -30,8 +30,6 @@ #include "cm1_44xx.h" #include "cm2_44xx.h" -#include "cm1_44xx.h" -#include "cm2_44xx.h" #include "cm-regbits-44xx.h" #include "prm44xx.h" #include "prcm44xx.h" diff --git a/arch/arm/mach-omap2/powerdomain2xxx_3xxx.c b/arch/arm/mach-omap2/powerdomain2xxx_3xxx.c index d5233890370..cf600e22bf8 100644 --- a/arch/arm/mach-omap2/powerdomain2xxx_3xxx.c +++ b/arch/arm/mach-omap2/powerdomain2xxx_3xxx.c @@ -19,7 +19,6 @@ #include <plat/prcm.h> #include "powerdomain.h" -#include "prm-regbits-34xx.h" #include "prm.h" #include "prm-regbits-24xx.h" #include "prm-regbits-34xx.h" diff --git a/arch/arm/mach-omap2/timer-gp.c b/arch/arm/mach-omap2/timer-gp.c index 4e48e786bec..7b7c2683ae7 100644 --- a/arch/arm/mach-omap2/timer-gp.c +++ b/arch/arm/mach-omap2/timer-gp.c @@ -42,6 +42,8 @@ #include "timer-gp.h" +#include <plat/common.h> + /* MAX_GPTIMER_ID: number of GPTIMERs on the chip */ #define MAX_GPTIMER_ID 12 @@ -176,10 +178,14 @@ static void __init omap2_gp_clockevent_init(void) /* * When 32k-timer is enabled, don't use GPTimer for clocksource * instead, just leave default clocksource which uses the 32k - * sync counter. See clocksource setup in see plat-omap/common.c. + * sync counter. See clocksource setup in plat-omap/counter_32k.c */ -static inline void __init omap2_gp_clocksource_init(void) {} +static void __init omap2_gp_clocksource_init(void) +{ + omap_init_clocksource_32k(); +} + #else /* * clocksource diff --git a/arch/arm/plat-omap/Kconfig b/arch/arm/plat-omap/Kconfig index 18fe3cb195d..b6333ae3f92 100644 --- a/arch/arm/plat-omap/Kconfig +++ b/arch/arm/plat-omap/Kconfig @@ -144,12 +144,9 @@ config OMAP_IOMMU_DEBUG config OMAP_IOMMU_IVA2 bool -choice - prompt "System timer" - default OMAP_32K_TIMER if !ARCH_OMAP15XX - config OMAP_MPU_TIMER bool "Use mpu timer" + depends on ARCH_OMAP1 help Select this option if you want to use the OMAP mpu timer. This timer provides more intra-tick resolution than the 32KHz timer, @@ -158,6 +155,7 @@ config OMAP_MPU_TIMER config OMAP_32K_TIMER bool "Use 32KHz timer" depends on ARCH_OMAP16XX || ARCH_OMAP2PLUS + default y if (ARCH_OMAP16XX || ARCH_OMAP2PLUS) help Select this option if you want to enable the OMAP 32KHz timer. This timer saves power compared to the OMAP_MPU_TIMER, and has @@ -165,8 +163,6 @@ config OMAP_32K_TIMER intra-tick resolution than OMAP_MPU_TIMER. The 32KHz timer is currently only available for OMAP16XX, 24XX, 34XX and OMAP4. -endchoice - config OMAP3_L2_AUX_SECURE_SAVE_RESTORE bool "OMAP3 HS/EMU save and restore for L2 AUX control register" depends on ARCH_OMAP3 && PM diff --git a/arch/arm/plat-omap/counter_32k.c b/arch/arm/plat-omap/counter_32k.c index ea4644021fb..862dda95d61 100644 --- a/arch/arm/plat-omap/counter_32k.c +++ b/arch/arm/plat-omap/counter_32k.c @@ -36,8 +36,6 @@ #define OMAP16XX_TIMER_32K_SYNCHRONIZED 0xfffbc410 -#if !(defined(CONFIG_ARCH_OMAP730) || defined(CONFIG_ARCH_OMAP15XX)) - #include <linux/clocksource.h> /* @@ -122,12 +120,24 @@ static DEFINE_CLOCK_DATA(cd); #define SC_MULT 4000000000u #define SC_SHIFT 17 -unsigned long long notrace sched_clock(void) +static inline unsigned long long notrace _omap_32k_sched_clock(void) { u32 cyc = clocksource_32k.read(&clocksource_32k); return cyc_to_fixed_sched_clock(&cd, cyc, (u32)~0, SC_MULT, SC_SHIFT); } +#ifndef CONFIG_OMAP_MPU_TIMER +unsigned long long notrace sched_clock(void) +{ + return _omap_32k_sched_clock(); +} +#else +unsigned long long notrace omap_32k_sched_clock(void) +{ + return _omap_32k_sched_clock(); +} +#endif + static void notrace omap_update_sched_clock(void) { u32 cyc = clocksource_32k.read(&clocksource_32k); @@ -160,7 +170,7 @@ void read_persistent_clock(struct timespec *ts) *ts = *tsp; } -static int __init omap_init_clocksource_32k(void) +int __init omap_init_clocksource_32k(void) { static char err[] __initdata = KERN_ERR "%s: can't register clocksource!\n"; @@ -195,7 +205,3 @@ static int __init omap_init_clocksource_32k(void) } return 0; } -arch_initcall(omap_init_clocksource_32k); - -#endif /* !(defined(CONFIG_ARCH_OMAP730) || defined(CONFIG_ARCH_OMAP15XX)) */ - diff --git a/arch/arm/plat-omap/dma.c b/arch/arm/plat-omap/dma.c index c4b2b478b1a..85363084cc1 100644 --- a/arch/arm/plat-omap/dma.c +++ b/arch/arm/plat-omap/dma.c @@ -53,7 +53,7 @@ enum { DMA_CHAIN_STARTED, DMA_CHAIN_NOTSTARTED }; #endif #define OMAP_DMA_ACTIVE 0x01 -#define OMAP2_DMA_CSR_CLEAR_MASK 0xffe +#define OMAP2_DMA_CSR_CLEAR_MASK 0xffffffff #define OMAP_FUNC_MUX_ARM_BASE (0xfffe1000 + 0xec) @@ -1873,7 +1873,7 @@ static int omap2_dma_handle_ch(int ch) printk(KERN_INFO "DMA misaligned error with device %d\n", dma_chan[ch].dev_id); - p->dma_write(OMAP2_DMA_CSR_CLEAR_MASK, CSR, ch); + p->dma_write(status, CSR, ch); p->dma_write(1 << ch, IRQSTATUS_L0, ch); /* read back the register to flush the write */ p->dma_read(IRQSTATUS_L0, ch); @@ -1893,10 +1893,9 @@ static int omap2_dma_handle_ch(int ch) OMAP_DMA_CHAIN_INCQHEAD(chain_id); status = p->dma_read(CSR, ch); + p->dma_write(status, CSR, ch); } - p->dma_write(status, CSR, ch); - if (likely(dma_chan[ch].callback != NULL)) dma_chan[ch].callback(ch, status, dma_chan[ch].data); diff --git a/arch/arm/plat-omap/include/plat/common.h b/arch/arm/plat-omap/include/plat/common.h index 6b8088ec74a..29b2afb4288 100644 --- a/arch/arm/plat-omap/include/plat/common.h +++ b/arch/arm/plat-omap/include/plat/common.h @@ -35,6 +35,9 @@ struct sys_timer; extern void omap_map_common_io(void); extern struct sys_timer omap_timer; +extern bool omap_32k_timer_init(void); +extern int __init omap_init_clocksource_32k(void); +extern unsigned long long notrace omap_32k_sched_clock(void); extern void omap_reserve(void); diff --git a/arch/powerpc/kernel/perf_event_fsl_emb.c b/arch/powerpc/kernel/perf_event_fsl_emb.c index 4dcf5f831e9..b0dc8f7069c 100644 --- a/arch/powerpc/kernel/perf_event_fsl_emb.c +++ b/arch/powerpc/kernel/perf_event_fsl_emb.c @@ -596,6 +596,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (left <= 0) left = period; record = 1; + event->hw.last_period = event->hw.sample_period; } if (left < 0x80000000LL) val = 0x80000000LL - left; diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 63e35ec9075..62f084478f7 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -1,48 +1,8 @@ #ifndef _ASM_X86_CACHEFLUSH_H #define _ASM_X86_CACHEFLUSH_H -/* Keep includes the same across arches. */ -#include <linux/mm.h> - /* Caches aren't brain-dead on the intel. */ -static inline void flush_cache_all(void) { } -static inline void flush_cache_mm(struct mm_struct *mm) { } -static inline void flush_cache_dup_mm(struct mm_struct *mm) { } -static inline void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) { } -static inline void flush_cache_page(struct vm_area_struct *vma, - unsigned long vmaddr, unsigned long pfn) { } -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -static inline void flush_dcache_page(struct page *page) { } -static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } -static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } -static inline void flush_icache_range(unsigned long start, - unsigned long end) { } -static inline void flush_icache_page(struct vm_area_struct *vma, - struct page *page) { } -static inline void flush_icache_user_range(struct vm_area_struct *vma, - struct page *page, - unsigned long addr, - unsigned long len) { } -static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } -static inline void flush_cache_vunmap(unsigned long start, - unsigned long end) { } - -static inline void copy_to_user_page(struct vm_area_struct *vma, - struct page *page, unsigned long vaddr, - void *dst, const void *src, - unsigned long len) -{ - memcpy(dst, src, len); -} - -static inline void copy_from_user_page(struct vm_area_struct *vma, - struct page *page, unsigned long vaddr, - void *dst, const void *src, - unsigned long len) -{ - memcpy(dst, src, len); -} +#include <asm-generic/cacheflush.h> #ifdef CONFIG_X86_PAT /* diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 4fab24de26b..6e6e7558e70 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -32,5 +32,6 @@ extern void arch_unregister_cpu(int); DECLARE_PER_CPU(int, cpu_state); +int __cpuinit mwait_usable(const struct cpuinfo_x86 *); #endif /* _ASM_X86_CPU_H */ diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index f52d42e8058..574dbc22893 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -14,7 +14,7 @@ do { \ asm goto("1:" \ JUMP_LABEL_INITIAL_NOP \ - ".pushsection __jump_table, \"a\" \n\t"\ + ".pushsection __jump_table, \"aw\" \n\t"\ _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ ".popsection \n\t" \ : : "i" (key) : : label); \ diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 7283e98deaa..ec2c19a7b8e 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ + { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ @@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ + { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ @@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] = { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ + { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index e12246ff5aa..6f8c5e9da97 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -59,6 +59,7 @@ struct thermal_state { /* Callback to handle core threshold interrupts */ int (*platform_thermal_notify)(__u64 msr_val); +EXPORT_SYMBOL(platform_thermal_notify); static DEFINE_PER_CPU(struct thermal_state, thermal_state); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d8286ed54ff..e764fc05d70 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -14,6 +14,7 @@ #include <linux/utsname.h> #include <trace/events/power.h> #include <linux/hw_breakpoint.h> +#include <asm/cpu.h> #include <asm/system.h> #include <asm/apic.h> #include <asm/syscalls.h> @@ -505,7 +506,7 @@ static void poll_idle(void) #define MWAIT_ECX_EXTENDED_INFO 0x01 #define MWAIT_EDX_C1 0xf0 -static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 763df77343d..0cbe8c0b35e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1402,8 +1402,9 @@ static inline void mwait_play_dead(void) unsigned int highest_subcstate = 0; int i; void *mwait_ptr; + struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); - if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_MWAIT)) + if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))) return; if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) return; diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c index bca9cb89a11..487a5473985 100644 --- a/drivers/atm/idt77105.c +++ b/drivers/atm/idt77105.c @@ -151,7 +151,7 @@ static int fetch_stats(struct atm_dev *dev,struct idt77105_stats __user *arg,int spin_unlock_irqrestore(&idt77105_priv_lock, flags); if (arg == NULL) return 0; - return copy_to_user(arg, &PRIV(dev)->stats, + return copy_to_user(arg, &stats, sizeof(struct idt77105_stats)) ? -EFAULT : 0; } diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c index 1f46f1cd922..36e0fa161c2 100644 --- a/drivers/char/tpm/tpm.c +++ b/drivers/char/tpm/tpm.c @@ -364,12 +364,14 @@ unsigned long tpm_calc_ordinal_duration(struct tpm_chip *chip, tpm_protected_ordinal_duration[ordinal & TPM_PROTECTED_ORDINAL_MASK]; - if (duration_idx != TPM_UNDEFINED) + if (duration_idx != TPM_UNDEFINED) { duration = chip->vendor.duration[duration_idx]; - if (duration <= 0) + /* if duration is 0, it's because chip->vendor.duration wasn't */ + /* filled yet, so we set the lowest timeout just to give enough */ + /* time for tpm_get_timeouts() to succeed */ + return (duration <= 0 ? HZ : duration); + } else return 2 * 60 * HZ; - else - return duration; } EXPORT_SYMBOL_GPL(tpm_calc_ordinal_duration); diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index c17a305ecb2..dd21df55689 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -493,9 +493,6 @@ static int tpm_tis_init(struct device *dev, resource_size_t start, "1.2 TPM (device-id 0x%X, rev-id %d)\n", vendor >> 16, ioread8(chip->vendor.iobase + TPM_RID(0))); - if (is_itpm(to_pnp_dev(dev))) - itpm = 1; - if (itpm) dev_info(dev, "Intel iTPM workaround enabled\n"); @@ -637,6 +634,9 @@ static int __devinit tpm_tis_pnp_init(struct pnp_dev *pnp_dev, else interrupts = 0; + if (is_itpm(pnp_dev)) + itpm = 1; + return tpm_tis_init(&pnp_dev->dev, start, len, irq); } diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index cfb0f527841..effe7974aa9 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -202,17 +202,21 @@ static int __init init_acpi_pm_clocksource(void) printk(KERN_INFO "PM-Timer had inconsistent results:" " 0x%#llx, 0x%#llx - aborting.\n", value1, value2); + pmtmr_ioport = 0; return -EINVAL; } if (i == ACPI_PM_READ_CHECKS) { printk(KERN_INFO "PM-Timer failed consistency check " " (0x%#llx) - aborting.\n", value1); + pmtmr_ioport = 0; return -ENODEV; } } - if (verify_pmtmr_rate() != 0) + if (verify_pmtmr_rate() != 0){ + pmtmr_ioport = 0; return -ENODEV; + } return clocksource_register_hz(&clocksource_acpi_pm, PMTMR_TICKS_PER_SEC); diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 7acb32e7f81..1fa091e0569 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -263,7 +263,7 @@ static void __setup_broadcast_timer(void *arg) clockevents_notify(reason, &cpu); } -static int __cpuinit setup_broadcast_cpuhp_notify(struct notifier_block *n, +static int setup_broadcast_cpuhp_notify(struct notifier_block *n, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; @@ -273,15 +273,11 @@ static int __cpuinit setup_broadcast_cpuhp_notify(struct notifier_block *n, smp_call_function_single(hotcpu, __setup_broadcast_timer, (void *)true, 1); break; - case CPU_DOWN_PREPARE: - smp_call_function_single(hotcpu, __setup_broadcast_timer, - (void *)false, 1); - break; } return NOTIFY_OK; } -static struct notifier_block __cpuinitdata setup_broadcast_notifier = { +static struct notifier_block setup_broadcast_notifier = { .notifier_call = setup_broadcast_cpuhp_notify, }; diff --git a/drivers/net/arm/ks8695net.c b/drivers/net/arm/ks8695net.c index 62d6f88cbab..aa07657744c 100644 --- a/drivers/net/arm/ks8695net.c +++ b/drivers/net/arm/ks8695net.c @@ -1644,7 +1644,7 @@ ks8695_cleanup(void) module_init(ks8695_init); module_exit(ks8695_cleanup); -MODULE_AUTHOR("Simtec Electronics") +MODULE_AUTHOR("Simtec Electronics"); MODULE_DESCRIPTION("Micrel KS8695 (Centaur) Ethernet driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:" MODULENAME); diff --git a/drivers/net/atl1c/atl1c_hw.c b/drivers/net/atl1c/atl1c_hw.c index 1bf67200994..23f2ab0f2fa 100644 --- a/drivers/net/atl1c/atl1c_hw.c +++ b/drivers/net/atl1c/atl1c_hw.c @@ -345,7 +345,7 @@ int atl1c_write_phy_reg(struct atl1c_hw *hw, u32 reg_addr, u16 phy_data) */ static int atl1c_phy_setup_adv(struct atl1c_hw *hw) { - u16 mii_adv_data = ADVERTISE_DEFAULT_CAP & ~ADVERTISE_SPEED_MASK; + u16 mii_adv_data = ADVERTISE_DEFAULT_CAP & ~ADVERTISE_ALL; u16 mii_giga_ctrl_data = GIGA_CR_1000T_DEFAULT_CAP & ~GIGA_CR_1000T_SPEED_MASK; @@ -373,7 +373,7 @@ static int atl1c_phy_setup_adv(struct atl1c_hw *hw) } if (atl1c_write_phy_reg(hw, MII_ADVERTISE, mii_adv_data) != 0 || - atl1c_write_phy_reg(hw, MII_GIGA_CR, mii_giga_ctrl_data) != 0) + atl1c_write_phy_reg(hw, MII_CTRL1000, mii_giga_ctrl_data) != 0) return -1; return 0; } @@ -517,19 +517,18 @@ int atl1c_phy_init(struct atl1c_hw *hw) "Error Setting up Auto-Negotiation\n"); return ret_val; } - mii_bmcr_data |= BMCR_AUTO_NEG_EN | BMCR_RESTART_AUTO_NEG; + mii_bmcr_data |= BMCR_ANENABLE | BMCR_ANRESTART; break; case MEDIA_TYPE_100M_FULL: - mii_bmcr_data |= BMCR_SPEED_100 | BMCR_FULL_DUPLEX; + mii_bmcr_data |= BMCR_SPEED100 | BMCR_FULLDPLX; break; case MEDIA_TYPE_100M_HALF: - mii_bmcr_data |= BMCR_SPEED_100; + mii_bmcr_data |= BMCR_SPEED100; break; case MEDIA_TYPE_10M_FULL: - mii_bmcr_data |= BMCR_SPEED_10 | BMCR_FULL_DUPLEX; + mii_bmcr_data |= BMCR_FULLDPLX; break; case MEDIA_TYPE_10M_HALF: - mii_bmcr_data |= BMCR_SPEED_10; break; default: if (netif_msg_link(adapter)) @@ -657,7 +656,7 @@ int atl1c_restart_autoneg(struct atl1c_hw *hw) err = atl1c_phy_setup_adv(hw); if (err) return err; - mii_bmcr_data |= BMCR_AUTO_NEG_EN | BMCR_RESTART_AUTO_NEG; + mii_bmcr_data |= BMCR_ANENABLE | BMCR_ANRESTART; return atl1c_write_phy_reg(hw, MII_BMCR, mii_bmcr_data); } diff --git a/drivers/net/atl1c/atl1c_hw.h b/drivers/net/atl1c/atl1c_hw.h index 3dd675979aa..655fc6c4a8a 100644 --- a/drivers/net/atl1c/atl1c_hw.h +++ b/drivers/net/atl1c/atl1c_hw.h @@ -736,55 +736,16 @@ int atl1c_phy_power_saving(struct atl1c_hw *hw); #define REG_DEBUG_DATA0 0x1900 #define REG_DEBUG_DATA1 0x1904 -/* PHY Control Register */ -#define MII_BMCR 0x00 -#define BMCR_SPEED_SELECT_MSB 0x0040 /* bits 6,13: 10=1000, 01=100, 00=10 */ -#define BMCR_COLL_TEST_ENABLE 0x0080 /* Collision test enable */ -#define BMCR_FULL_DUPLEX 0x0100 /* FDX =1, half duplex =0 */ -#define BMCR_RESTART_AUTO_NEG 0x0200 /* Restart auto negotiation */ -#define BMCR_ISOLATE 0x0400 /* Isolate PHY from MII */ -#define BMCR_POWER_DOWN 0x0800 /* Power down */ -#define BMCR_AUTO_NEG_EN 0x1000 /* Auto Neg Enable */ -#define BMCR_SPEED_SELECT_LSB 0x2000 /* bits 6,13: 10=1000, 01=100, 00=10 */ -#define BMCR_LOOPBACK 0x4000 /* 0 = normal, 1 = loopback */ -#define BMCR_RESET 0x8000 /* 0 = normal, 1 = PHY reset */ -#define BMCR_SPEED_MASK 0x2040 -#define BMCR_SPEED_1000 0x0040 -#define BMCR_SPEED_100 0x2000 -#define BMCR_SPEED_10 0x0000 - -/* PHY Status Register */ -#define MII_BMSR 0x01 -#define BMMSR_EXTENDED_CAPS 0x0001 /* Extended register capabilities */ -#define BMSR_JABBER_DETECT 0x0002 /* Jabber Detected */ -#define BMSR_LINK_STATUS 0x0004 /* Link Status 1 = link */ -#define BMSR_AUTONEG_CAPS 0x0008 /* Auto Neg Capable */ -#define BMSR_REMOTE_FAULT 0x0010 /* Remote Fault Detect */ -#define BMSR_AUTONEG_COMPLETE 0x0020 /* Auto Neg Complete */ -#define BMSR_PREAMBLE_SUPPRESS 0x0040 /* Preamble may be suppressed */ -#define BMSR_EXTENDED_STATUS 0x0100 /* Ext. status info in Reg 0x0F */ -#define BMSR_100T2_HD_CAPS 0x0200 /* 100T2 Half Duplex Capable */ -#define BMSR_100T2_FD_CAPS 0x0400 /* 100T2 Full Duplex Capable */ -#define BMSR_10T_HD_CAPS 0x0800 /* 10T Half Duplex Capable */ -#define BMSR_10T_FD_CAPS 0x1000 /* 10T Full Duplex Capable */ -#define BMSR_100X_HD_CAPS 0x2000 /* 100X Half Duplex Capable */ -#define BMMII_SR_100X_FD_CAPS 0x4000 /* 100X Full Duplex Capable */ -#define BMMII_SR_100T4_CAPS 0x8000 /* 100T4 Capable */ - -#define MII_PHYSID1 0x02 -#define MII_PHYSID2 0x03 #define L1D_MPW_PHYID1 0xD01C /* V7 */ #define L1D_MPW_PHYID2 0xD01D /* V1-V6 */ #define L1D_MPW_PHYID3 0xD01E /* V8 */ /* Autoneg Advertisement Register */ -#define MII_ADVERTISE 0x04 -#define ADVERTISE_SPEED_MASK 0x01E0 -#define ADVERTISE_DEFAULT_CAP 0x0DE0 +#define ADVERTISE_DEFAULT_CAP \ + (ADVERTISE_ALL | ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM) /* 1000BASE-T Control Register */ -#define MII_GIGA_CR 0x09 #define GIGA_CR_1000T_REPEATER_DTE 0x0400 /* 1=Repeater/switch device port 0=DTE device */ #define GIGA_CR_1000T_MS_VALUE 0x0800 /* 1=Configure PHY as Master 0=Configure PHY as Slave */ diff --git a/drivers/net/atl1e/atl1e_ethtool.c b/drivers/net/atl1e/atl1e_ethtool.c index 6943a6c3b94..1209297433b 100644 --- a/drivers/net/atl1e/atl1e_ethtool.c +++ b/drivers/net/atl1e/atl1e_ethtool.c @@ -95,18 +95,18 @@ static int atl1e_set_settings(struct net_device *netdev, ecmd->advertising = hw->autoneg_advertised | ADVERTISED_TP | ADVERTISED_Autoneg; - adv4 = hw->mii_autoneg_adv_reg & ~MII_AR_SPEED_MASK; + adv4 = hw->mii_autoneg_adv_reg & ~ADVERTISE_ALL; adv9 = hw->mii_1000t_ctrl_reg & ~MII_AT001_CR_1000T_SPEED_MASK; if (hw->autoneg_advertised & ADVERTISE_10_HALF) - adv4 |= MII_AR_10T_HD_CAPS; + adv4 |= ADVERTISE_10HALF; if (hw->autoneg_advertised & ADVERTISE_10_FULL) - adv4 |= MII_AR_10T_FD_CAPS; + adv4 |= ADVERTISE_10FULL; if (hw->autoneg_advertised & ADVERTISE_100_HALF) - adv4 |= MII_AR_100TX_HD_CAPS; + adv4 |= ADVERTISE_100HALF; if (hw->autoneg_advertised & ADVERTISE_100_FULL) - adv4 |= MII_AR_100TX_FD_CAPS; + adv4 |= ADVERTISE_100FULL; if (hw->autoneg_advertised & ADVERTISE_1000_FULL) - adv9 |= MII_AT001_CR_1000T_FD_CAPS; + adv9 |= ADVERTISE_1000FULL; if (adv4 != hw->mii_autoneg_adv_reg || adv9 != hw->mii_1000t_ctrl_reg) { diff --git a/drivers/net/atl1e/atl1e_hw.c b/drivers/net/atl1e/atl1e_hw.c index 76cc043def8..923063d2e5b 100644 --- a/drivers/net/atl1e/atl1e_hw.c +++ b/drivers/net/atl1e/atl1e_hw.c @@ -318,7 +318,7 @@ static int atl1e_phy_setup_autoneg_adv(struct atl1e_hw *hw) * Advertisement Register (Address 4) and the 1000 mb speed bits in * the 1000Base-T control Register (Address 9). */ - mii_autoneg_adv_reg &= ~MII_AR_SPEED_MASK; + mii_autoneg_adv_reg &= ~ADVERTISE_ALL; mii_1000t_ctrl_reg &= ~MII_AT001_CR_1000T_SPEED_MASK; /* @@ -327,44 +327,37 @@ static int atl1e_phy_setup_autoneg_adv(struct atl1e_hw *hw) */ switch (hw->media_type) { case MEDIA_TYPE_AUTO_SENSOR: - mii_autoneg_adv_reg |= (MII_AR_10T_HD_CAPS | - MII_AR_10T_FD_CAPS | - MII_AR_100TX_HD_CAPS | - MII_AR_100TX_FD_CAPS); - hw->autoneg_advertised = ADVERTISE_10_HALF | - ADVERTISE_10_FULL | - ADVERTISE_100_HALF | - ADVERTISE_100_FULL; + mii_autoneg_adv_reg |= ADVERTISE_ALL; + hw->autoneg_advertised = ADVERTISE_ALL; if (hw->nic_type == athr_l1e) { - mii_1000t_ctrl_reg |= - MII_AT001_CR_1000T_FD_CAPS; + mii_1000t_ctrl_reg |= ADVERTISE_1000FULL; hw->autoneg_advertised |= ADVERTISE_1000_FULL; } break; case MEDIA_TYPE_100M_FULL: - mii_autoneg_adv_reg |= MII_AR_100TX_FD_CAPS; + mii_autoneg_adv_reg |= ADVERTISE_100FULL; hw->autoneg_advertised = ADVERTISE_100_FULL; break; case MEDIA_TYPE_100M_HALF: - mii_autoneg_adv_reg |= MII_AR_100TX_HD_CAPS; + mii_autoneg_adv_reg |= ADVERTISE_100_HALF; hw->autoneg_advertised = ADVERTISE_100_HALF; break; case MEDIA_TYPE_10M_FULL: - mii_autoneg_adv_reg |= MII_AR_10T_FD_CAPS; + mii_autoneg_adv_reg |= ADVERTISE_10_FULL; hw->autoneg_advertised = ADVERTISE_10_FULL; break; default: - mii_autoneg_adv_reg |= MII_AR_10T_HD_CAPS; + mii_autoneg_adv_reg |= ADVERTISE_10_HALF; hw->autoneg_advertised = ADVERTISE_10_HALF; break; } /* flow control fixed to enable all */ - mii_autoneg_adv_reg |= (MII_AR_ASM_DIR | MII_AR_PAUSE); + mii_autoneg_adv_reg |= (ADVERTISE_PAUSE_ASYM | ADVERTISE_PAUSE_CAP); hw->mii_autoneg_adv_reg = mii_autoneg_adv_reg; hw->mii_1000t_ctrl_reg = mii_1000t_ctrl_reg; @@ -374,7 +367,7 @@ static int atl1e_phy_setup_autoneg_adv(struct atl1e_hw *hw) return ret_val; if (hw->nic_type == athr_l1e || hw->nic_type == athr_l2e_revA) { - ret_val = atl1e_write_phy_reg(hw, MII_AT001_CR, + ret_val = atl1e_write_phy_reg(hw, MII_CTRL1000, mii_1000t_ctrl_reg); if (ret_val) return ret_val; @@ -397,7 +390,7 @@ int atl1e_phy_commit(struct atl1e_hw *hw) int ret_val; u16 phy_data; - phy_data = MII_CR_RESET | MII_CR_AUTO_NEG_EN | MII_CR_RESTART_AUTO_NEG; + phy_data = BMCR_RESET | BMCR_ANENABLE | BMCR_ANRESTART; ret_val = atl1e_write_phy_reg(hw, MII_BMCR, phy_data); if (ret_val) { @@ -645,15 +638,14 @@ int atl1e_restart_autoneg(struct atl1e_hw *hw) return err; if (hw->nic_type == athr_l1e || hw->nic_type == athr_l2e_revA) { - err = atl1e_write_phy_reg(hw, MII_AT001_CR, + err = atl1e_write_phy_reg(hw, MII_CTRL1000, hw->mii_1000t_ctrl_reg); if (err) return err; } err = atl1e_write_phy_reg(hw, MII_BMCR, - MII_CR_RESET | MII_CR_AUTO_NEG_EN | - MII_CR_RESTART_AUTO_NEG); + BMCR_RESET | BMCR_ANENABLE | BMCR_ANRESTART); return err; } diff --git a/drivers/net/atl1e/atl1e_hw.h b/drivers/net/atl1e/atl1e_hw.h index 5ea2f4d86cf..74df16aef79 100644 --- a/drivers/net/atl1e/atl1e_hw.h +++ b/drivers/net/atl1e/atl1e_hw.h @@ -629,127 +629,24 @@ s32 atl1e_restart_autoneg(struct atl1e_hw *hw); /***************************** MII definition ***************************************/ /* PHY Common Register */ -#define MII_BMCR 0x00 -#define MII_BMSR 0x01 -#define MII_PHYSID1 0x02 -#define MII_PHYSID2 0x03 -#define MII_ADVERTISE 0x04 -#define MII_LPA 0x05 -#define MII_EXPANSION 0x06 -#define MII_AT001_CR 0x09 -#define MII_AT001_SR 0x0A -#define MII_AT001_ESR 0x0F #define MII_AT001_PSCR 0x10 #define MII_AT001_PSSR 0x11 #define MII_INT_CTRL 0x12 #define MII_INT_STATUS 0x13 #define MII_SMARTSPEED 0x14 -#define MII_RERRCOUNTER 0x15 -#define MII_SREVISION 0x16 -#define MII_RESV1 0x17 #define MII_LBRERROR 0x18 -#define MII_PHYADDR 0x19 #define MII_RESV2 0x1a -#define MII_TPISTATUS 0x1b -#define MII_NCONFIG 0x1c #define MII_DBG_ADDR 0x1D #define MII_DBG_DATA 0x1E - -/* PHY Control Register */ -#define MII_CR_SPEED_SELECT_MSB 0x0040 /* bits 6,13: 10=1000, 01=100, 00=10 */ -#define MII_CR_COLL_TEST_ENABLE 0x0080 /* Collision test enable */ -#define MII_CR_FULL_DUPLEX 0x0100 /* FDX =1, half duplex =0 */ -#define MII_CR_RESTART_AUTO_NEG 0x0200 /* Restart auto negotiation */ -#define MII_CR_ISOLATE 0x0400 /* Isolate PHY from MII */ -#define MII_CR_POWER_DOWN 0x0800 /* Power down */ -#define MII_CR_AUTO_NEG_EN 0x1000 /* Auto Neg Enable */ -#define MII_CR_SPEED_SELECT_LSB 0x2000 /* bits 6,13: 10=1000, 01=100, 00=10 */ -#define MII_CR_LOOPBACK 0x4000 /* 0 = normal, 1 = loopback */ -#define MII_CR_RESET 0x8000 /* 0 = normal, 1 = PHY reset */ -#define MII_CR_SPEED_MASK 0x2040 -#define MII_CR_SPEED_1000 0x0040 -#define MII_CR_SPEED_100 0x2000 -#define MII_CR_SPEED_10 0x0000 - - -/* PHY Status Register */ -#define MII_SR_EXTENDED_CAPS 0x0001 /* Extended register capabilities */ -#define MII_SR_JABBER_DETECT 0x0002 /* Jabber Detected */ -#define MII_SR_LINK_STATUS 0x0004 /* Link Status 1 = link */ -#define MII_SR_AUTONEG_CAPS 0x0008 /* Auto Neg Capable */ -#define MII_SR_REMOTE_FAULT 0x0010 /* Remote Fault Detect */ -#define MII_SR_AUTONEG_COMPLETE 0x0020 /* Auto Neg Complete */ -#define MII_SR_PREAMBLE_SUPPRESS 0x0040 /* Preamble may be suppressed */ -#define MII_SR_EXTENDED_STATUS 0x0100 /* Ext. status info in Reg 0x0F */ -#define MII_SR_100T2_HD_CAPS 0x0200 /* 100T2 Half Duplex Capable */ -#define MII_SR_100T2_FD_CAPS 0x0400 /* 100T2 Full Duplex Capable */ -#define MII_SR_10T_HD_CAPS 0x0800 /* 10T Half Duplex Capable */ -#define MII_SR_10T_FD_CAPS 0x1000 /* 10T Full Duplex Capable */ -#define MII_SR_100X_HD_CAPS 0x2000 /* 100X Half Duplex Capable */ -#define MII_SR_100X_FD_CAPS 0x4000 /* 100X Full Duplex Capable */ -#define MII_SR_100T4_CAPS 0x8000 /* 100T4 Capable */ - -/* Link partner ability register. */ -#define MII_LPA_SLCT 0x001f /* Same as advertise selector */ -#define MII_LPA_10HALF 0x0020 /* Can do 10mbps half-duplex */ -#define MII_LPA_10FULL 0x0040 /* Can do 10mbps full-duplex */ -#define MII_LPA_100HALF 0x0080 /* Can do 100mbps half-duplex */ -#define MII_LPA_100FULL 0x0100 /* Can do 100mbps full-duplex */ -#define MII_LPA_100BASE4 0x0200 /* 100BASE-T4 */ -#define MII_LPA_PAUSE 0x0400 /* PAUSE */ -#define MII_LPA_ASYPAUSE 0x0800 /* Asymmetrical PAUSE */ -#define MII_LPA_RFAULT 0x2000 /* Link partner faulted */ -#define MII_LPA_LPACK 0x4000 /* Link partner acked us */ -#define MII_LPA_NPAGE 0x8000 /* Next page bit */ - /* Autoneg Advertisement Register */ -#define MII_AR_SELECTOR_FIELD 0x0001 /* indicates IEEE 802.3 CSMA/CD */ -#define MII_AR_10T_HD_CAPS 0x0020 /* 10T Half Duplex Capable */ -#define MII_AR_10T_FD_CAPS 0x0040 /* 10T Full Duplex Capable */ -#define MII_AR_100TX_HD_CAPS 0x0080 /* 100TX Half Duplex Capable */ -#define MII_AR_100TX_FD_CAPS 0x0100 /* 100TX Full Duplex Capable */ -#define MII_AR_100T4_CAPS 0x0200 /* 100T4 Capable */ -#define MII_AR_PAUSE 0x0400 /* Pause operation desired */ -#define MII_AR_ASM_DIR 0x0800 /* Asymmetric Pause Direction bit */ -#define MII_AR_REMOTE_FAULT 0x2000 /* Remote Fault detected */ -#define MII_AR_NEXT_PAGE 0x8000 /* Next Page ability supported */ -#define MII_AR_SPEED_MASK 0x01E0 -#define MII_AR_DEFAULT_CAP_MASK 0x0DE0 +#define MII_AR_DEFAULT_CAP_MASK 0 /* 1000BASE-T Control Register */ -#define MII_AT001_CR_1000T_HD_CAPS 0x0100 /* Advertise 1000T HD capability */ -#define MII_AT001_CR_1000T_FD_CAPS 0x0200 /* Advertise 1000T FD capability */ -#define MII_AT001_CR_1000T_REPEATER_DTE 0x0400 /* 1=Repeater/switch device port */ -/* 0=DTE device */ -#define MII_AT001_CR_1000T_MS_VALUE 0x0800 /* 1=Configure PHY as Master */ -/* 0=Configure PHY as Slave */ -#define MII_AT001_CR_1000T_MS_ENABLE 0x1000 /* 1=Master/Slave manual config value */ -/* 0=Automatic Master/Slave config */ -#define MII_AT001_CR_1000T_TEST_MODE_NORMAL 0x0000 /* Normal Operation */ -#define MII_AT001_CR_1000T_TEST_MODE_1 0x2000 /* Transmit Waveform test */ -#define MII_AT001_CR_1000T_TEST_MODE_2 0x4000 /* Master Transmit Jitter test */ -#define MII_AT001_CR_1000T_TEST_MODE_3 0x6000 /* Slave Transmit Jitter test */ -#define MII_AT001_CR_1000T_TEST_MODE_4 0x8000 /* Transmitter Distortion test */ -#define MII_AT001_CR_1000T_SPEED_MASK 0x0300 -#define MII_AT001_CR_1000T_DEFAULT_CAP_MASK 0x0300 - -/* 1000BASE-T Status Register */ -#define MII_AT001_SR_1000T_LP_HD_CAPS 0x0400 /* LP is 1000T HD capable */ -#define MII_AT001_SR_1000T_LP_FD_CAPS 0x0800 /* LP is 1000T FD capable */ -#define MII_AT001_SR_1000T_REMOTE_RX_STATUS 0x1000 /* Remote receiver OK */ -#define MII_AT001_SR_1000T_LOCAL_RX_STATUS 0x2000 /* Local receiver OK */ -#define MII_AT001_SR_1000T_MS_CONFIG_RES 0x4000 /* 1=Local TX is Master, 0=Slave */ -#define MII_AT001_SR_1000T_MS_CONFIG_FAULT 0x8000 /* Master/Slave config fault */ -#define MII_AT001_SR_1000T_REMOTE_RX_STATUS_SHIFT 12 -#define MII_AT001_SR_1000T_LOCAL_RX_STATUS_SHIFT 13 - -/* Extended Status Register */ -#define MII_AT001_ESR_1000T_HD_CAPS 0x1000 /* 1000T HD capable */ -#define MII_AT001_ESR_1000T_FD_CAPS 0x2000 /* 1000T FD capable */ -#define MII_AT001_ESR_1000X_HD_CAPS 0x4000 /* 1000X HD capable */ -#define MII_AT001_ESR_1000X_FD_CAPS 0x8000 /* 1000X FD capable */ +#define MII_AT001_CR_1000T_SPEED_MASK \ + (ADVERTISE_1000FULL | ADVERTISE_1000HALF) +#define MII_AT001_CR_1000T_DEFAULT_CAP_MASK MII_AT001_CR_1000T_SPEED_MASK /* AT001 PHY Specific Control Register */ #define MII_AT001_PSCR_JABBER_DISABLE 0x0001 /* 1=Jabber Function disabled */ diff --git a/drivers/net/atl1e/atl1e_main.c b/drivers/net/atl1e/atl1e_main.c index e28f8baf394..bf7500ccd73 100644 --- a/drivers/net/atl1e/atl1e_main.c +++ b/drivers/net/atl1e/atl1e_main.c @@ -2051,9 +2051,9 @@ static int atl1e_suspend(struct pci_dev *pdev, pm_message_t state) atl1e_read_phy_reg(hw, MII_BMSR, (u16 *)&mii_bmsr_data); atl1e_read_phy_reg(hw, MII_BMSR, (u16 *)&mii_bmsr_data); - mii_advertise_data = MII_AR_10T_HD_CAPS; + mii_advertise_data = ADVERTISE_10HALF; - if ((atl1e_write_phy_reg(hw, MII_AT001_CR, 0) != 0) || + if ((atl1e_write_phy_reg(hw, MII_CTRL1000, 0) != 0) || (atl1e_write_phy_reg(hw, MII_ADVERTISE, mii_advertise_data) != 0) || (atl1e_phy_commit(hw)) != 0) { diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index 171782e2bb3..1024ae15822 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -2470,6 +2470,10 @@ int bond_3ad_lacpdu_recv(struct sk_buff *skb, struct net_device *dev, struct pac if (!(dev->flags & IFF_MASTER)) goto out; + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto out; + if (!pskb_may_pull(skb, sizeof(struct lacpdu))) goto out; diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c index f4e638c6512..5c6fba802f2 100644 --- a/drivers/net/bonding/bond_alb.c +++ b/drivers/net/bonding/bond_alb.c @@ -326,6 +326,10 @@ static int rlb_arp_recv(struct sk_buff *skb, struct net_device *bond_dev, struct goto out; } + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto out; + if (!pskb_may_pull(skb, arp_hdr_len(bond_dev))) goto out; diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b1025b85acf..163e0b06eaa 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2733,6 +2733,10 @@ static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct pack if (!slave || !slave_do_arp_validate(bond, slave)) goto out_unlock; + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto out_unlock; + if (!pskb_may_pull(skb, arp_hdr_len(dev))) goto out_unlock; diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index d5a9db60ade..986195eaa57 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -117,6 +117,8 @@ source "drivers/net/can/sja1000/Kconfig" source "drivers/net/can/usb/Kconfig" +source "drivers/net/can/softing/Kconfig" + config CAN_DEBUG_DEVICES bool "CAN devices debugging messages" depends on CAN diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile index 07ca159ba3f..53c82a71778 100644 --- a/drivers/net/can/Makefile +++ b/drivers/net/can/Makefile @@ -9,6 +9,7 @@ obj-$(CONFIG_CAN_DEV) += can-dev.o can-dev-y := dev.o obj-y += usb/ +obj-y += softing/ obj-$(CONFIG_CAN_SJA1000) += sja1000/ obj-$(CONFIG_CAN_MSCAN) += mscan/ diff --git a/drivers/net/can/softing/Kconfig b/drivers/net/can/softing/Kconfig new file mode 100644 index 00000000000..92bd6bdde5e --- /dev/null +++ b/drivers/net/can/softing/Kconfig @@ -0,0 +1,30 @@ +config CAN_SOFTING + tristate "Softing Gmbh CAN generic support" + depends on CAN_DEV + ---help--- + Support for CAN cards from Softing Gmbh & some cards + from Vector Gmbh. + Softing Gmbh CAN cards come with 1 or 2 physical busses. + Those cards typically use Dual Port RAM to communicate + with the host CPU. The interface is then identical for PCI + and PCMCIA cards. This driver operates on a platform device, + which has been created by softing_cs or softing_pci driver. + Warning: + The API of the card does not allow fine control per bus, but + controls the 2 busses on the card together. + As such, some actions (start/stop/busoff recovery) on 1 bus + must bring down the other bus too temporarily. + +config CAN_SOFTING_CS + tristate "Softing Gmbh CAN pcmcia cards" + depends on PCMCIA + select CAN_SOFTING + ---help--- + Support for PCMCIA cards from Softing Gmbh & some cards + from Vector Gmbh. + You need firmware for these, which you can get at + http://developer.berlios.de/projects/socketcan/ + This version of the driver is written against + firmware version 4.6 (softing-fw-4.6-binaries.tar.gz) + In order to use the card as CAN device, you need the Softing generic + support too. diff --git a/drivers/net/can/softing/Makefile b/drivers/net/can/softing/Makefile new file mode 100644 index 00000000000..c5e5016c742 --- /dev/null +++ b/drivers/net/can/softing/Makefile @@ -0,0 +1,6 @@ + +softing-y := softing_main.o softing_fw.o +obj-$(CONFIG_CAN_SOFTING) += softing.o +obj-$(CONFIG_CAN_SOFTING_CS) += softing_cs.o + +ccflags-$(CONFIG_CAN_DEBUG_DEVICES) := -DDEBUG diff --git a/drivers/net/can/softing/softing.h b/drivers/net/can/softing/softing.h new file mode 100644 index 00000000000..7ec9f4db3d5 --- /dev/null +++ b/drivers/net/can/softing/softing.h @@ -0,0 +1,167 @@ +/* + * softing common interfaces + * + * by Kurt Van Dijck, 2008-2010 + */ + +#include <linux/atomic.h> +#include <linux/netdevice.h> +#include <linux/ktime.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/can.h> +#include <linux/can/dev.h> + +#include "softing_platform.h" + +struct softing; + +struct softing_priv { + struct can_priv can; /* must be the first member! */ + struct net_device *netdev; + struct softing *card; + struct { + int pending; + /* variables wich hold the circular buffer */ + int echo_put; + int echo_get; + } tx; + struct can_bittiming_const btr_const; + int index; + uint8_t output; + uint16_t chip; +}; +#define netdev2softing(netdev) ((struct softing_priv *)netdev_priv(netdev)) + +struct softing { + const struct softing_platform_data *pdat; + struct platform_device *pdev; + struct net_device *net[2]; + spinlock_t spin; /* protect this structure & DPRAM access */ + ktime_t ts_ref; + ktime_t ts_overflow; /* timestamp overflow value, in ktime */ + + struct { + /* indication of firmware status */ + int up; + /* protection of the 'up' variable */ + struct mutex lock; + } fw; + struct { + int nr; + int requested; + int svc_count; + unsigned int dpram_position; + } irq; + struct { + int pending; + int last_bus; + /* + * keep the bus that last tx'd a message, + * in order to let every netdev queue resume + */ + } tx; + __iomem uint8_t *dpram; + unsigned long dpram_phys; + unsigned long dpram_size; + struct { + uint16_t fw_version, hw_version, license, serial; + uint16_t chip[2]; + unsigned int freq; /* remote cpu's operating frequency */ + } id; +}; + +extern int softing_default_output(struct net_device *netdev); + +extern ktime_t softing_raw2ktime(struct softing *card, u32 raw); + +extern int softing_chip_poweron(struct softing *card); + +extern int softing_bootloader_command(struct softing *card, int16_t cmd, + const char *msg); + +/* Load firmware after reset */ +extern int softing_load_fw(const char *file, struct softing *card, + __iomem uint8_t *virt, unsigned int size, int offset); + +/* Load final application firmware after bootloader */ +extern int softing_load_app_fw(const char *file, struct softing *card); + +/* + * enable or disable irq + * only called with fw.lock locked + */ +extern int softing_enable_irq(struct softing *card, int enable); + +/* start/stop 1 bus on card */ +extern int softing_startstop(struct net_device *netdev, int up); + +/* netif_rx() */ +extern int softing_netdev_rx(struct net_device *netdev, + const struct can_frame *msg, ktime_t ktime); + +/* SOFTING DPRAM mappings */ +#define DPRAM_RX 0x0000 + #define DPRAM_RX_SIZE 32 + #define DPRAM_RX_CNT 16 +#define DPRAM_RX_RD 0x0201 /* uint8_t */ +#define DPRAM_RX_WR 0x0205 /* uint8_t */ +#define DPRAM_RX_LOST 0x0207 /* uint8_t */ + +#define DPRAM_FCT_PARAM 0x0300 /* int16_t [20] */ +#define DPRAM_FCT_RESULT 0x0328 /* int16_t */ +#define DPRAM_FCT_HOST 0x032b /* uint16_t */ + +#define DPRAM_INFO_BUSSTATE 0x0331 /* uint16_t */ +#define DPRAM_INFO_BUSSTATE2 0x0335 /* uint16_t */ +#define DPRAM_INFO_ERRSTATE 0x0339 /* uint16_t */ +#define DPRAM_INFO_ERRSTATE2 0x033d /* uint16_t */ +#define DPRAM_RESET 0x0341 /* uint16_t */ +#define DPRAM_CLR_RECV_FIFO 0x0345 /* uint16_t */ +#define DPRAM_RESET_TIME 0x034d /* uint16_t */ +#define DPRAM_TIME 0x0350 /* uint64_t */ +#define DPRAM_WR_START 0x0358 /* uint8_t */ +#define DPRAM_WR_END 0x0359 /* uint8_t */ +#define DPRAM_RESET_RX_FIFO 0x0361 /* uint16_t */ +#define DPRAM_RESET_TX_FIFO 0x0364 /* uint8_t */ +#define DPRAM_READ_FIFO_LEVEL 0x0365 /* uint8_t */ +#define DPRAM_RX_FIFO_LEVEL 0x0366 /* uint16_t */ +#define DPRAM_TX_FIFO_LEVEL 0x0366 /* uint16_t */ + +#define DPRAM_TX 0x0400 /* uint16_t */ + #define DPRAM_TX_SIZE 16 + #define DPRAM_TX_CNT 32 +#define DPRAM_TX_RD 0x0601 /* uint8_t */ +#define DPRAM_TX_WR 0x0605 /* uint8_t */ + +#define DPRAM_COMMAND 0x07e0 /* uint16_t */ +#define DPRAM_RECEIPT 0x07f0 /* uint16_t */ +#define DPRAM_IRQ_TOHOST 0x07fe /* uint8_t */ +#define DPRAM_IRQ_TOCARD 0x07ff /* uint8_t */ + +#define DPRAM_V2_RESET 0x0e00 /* uint8_t */ +#define DPRAM_V2_IRQ_TOHOST 0x0e02 /* uint8_t */ + +#define TXMAX (DPRAM_TX_CNT - 1) + +/* DPRAM return codes */ +#define RES_NONE 0 +#define RES_OK 1 +#define RES_NOK 2 +#define RES_UNKNOWN 3 +/* DPRAM flags */ +#define CMD_TX 0x01 +#define CMD_ACK 0x02 +#define CMD_XTD 0x04 +#define CMD_RTR 0x08 +#define CMD_ERR 0x10 +#define CMD_BUS2 0x80 + +/* returned fifo entry bus state masks */ +#define SF_MASK_BUSOFF 0x80 +#define SF_MASK_EPASSIVE 0x60 + +/* bus states */ +#define STATE_BUSOFF 2 +#define STATE_EPASSIVE 1 +#define STATE_EACTIVE 0 diff --git a/drivers/net/can/softing/softing_cs.c b/drivers/net/can/softing/softing_cs.c new file mode 100644 index 00000000000..300fe75dd1a --- /dev/null +++ b/drivers/net/can/softing/softing_cs.c @@ -0,0 +1,359 @@ +/* + * Copyright (C) 2008-2010 + * + * - Kurt Van Dijck, EIA Electronics + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the version 2 of the GNU General Public License + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/module.h> +#include <linux/kernel.h> + +#include <pcmcia/cistpl.h> +#include <pcmcia/ds.h> + +#include "softing_platform.h" + +static int softingcs_index; +static spinlock_t softingcs_index_lock; + +static int softingcs_reset(struct platform_device *pdev, int v); +static int softingcs_enable_irq(struct platform_device *pdev, int v); + +/* + * platform_data descriptions + */ +#define MHZ (1000*1000) +static const struct softing_platform_data softingcs_platform_data[] = { +{ + .name = "CANcard", + .manf = 0x0168, .prod = 0x001, + .generation = 1, + .nbus = 2, + .freq = 16 * MHZ, .max_brp = 32, .max_sjw = 4, + .dpram_size = 0x0800, + .boot = {0x0000, 0x000000, fw_dir "bcard.bin",}, + .load = {0x0120, 0x00f600, fw_dir "ldcard.bin",}, + .app = {0x0010, 0x0d0000, fw_dir "cancard.bin",}, + .reset = softingcs_reset, + .enable_irq = softingcs_enable_irq, +}, { + .name = "CANcard-NEC", + .manf = 0x0168, .prod = 0x002, + .generation = 1, + .nbus = 2, + .freq = 16 * MHZ, .max_brp = 32, .max_sjw = 4, + .dpram_size = 0x0800, + .boot = {0x0000, 0x000000, fw_dir "bcard.bin",}, + .load = {0x0120, 0x00f600, fw_dir "ldcard.bin",}, + .app = {0x0010, 0x0d0000, fw_dir "cancard.bin",}, + .reset = softingcs_reset, + .enable_irq = softingcs_enable_irq, +}, { + .name = "CANcard-SJA", + .manf = 0x0168, .prod = 0x004, + .generation = 1, + .nbus = 2, + .freq = 20 * MHZ, .max_brp = 32, .max_sjw = 4, + .dpram_size = 0x0800, + .boot = {0x0000, 0x000000, fw_dir "bcard.bin",}, + .load = {0x0120, 0x00f600, fw_dir "ldcard.bin",}, + .app = {0x0010, 0x0d0000, fw_dir "cansja.bin",}, + .reset = softingcs_reset, + .enable_irq = softingcs_enable_irq, +}, { + .name = "CANcard-2", + .manf = 0x0168, .prod = 0x005, + .generation = 2, + .nbus = 2, + .freq = 24 * MHZ, .max_brp = 64, .max_sjw = 4, + .dpram_size = 0x1000, + .boot = {0x0000, 0x000000, fw_dir "bcard2.bin",}, + .load = {0x0120, 0x00f600, fw_dir "ldcard2.bin",}, + .app = {0x0010, 0x0d0000, fw_dir "cancrd2.bin",}, + .reset = softingcs_reset, + .enable_irq = NULL, +}, { + .name = "Vector-CANcard", + .manf = 0x0168, .prod = 0x081, + .generation = 1, + .nbus = 2, + .freq = 16 * MHZ, .max_brp = 64, .max_sjw = 4, + .dpram_size = 0x0800, + .boot = {0x0000, 0x000000, fw_dir "bcard.bin",}, + .load = {0x0120, 0x00f600, fw_dir "ldcard.bin",}, + .app = {0x0010, 0x0d0000, fw_dir "cancard.bin",}, + .reset = softingcs_reset, + .enable_irq = softingcs_enable_irq, +}, { + .name = "Vector-CANcard-SJA", + .manf = 0x0168, .prod = 0x084, + .generation = 1, + .nbus = 2, + .freq = 20 * MHZ, .max_brp = 32, .max_sjw = 4, + .dpram_size = 0x0800, + .boot = {0x0000, 0x000000, fw_dir "bcard.bin",}, + .load = {0x0120, 0x00f600, fw_dir "ldcard.bin",}, + .app = {0x0010, 0x0d0000, fw_dir "cansja.bin",}, + .reset = softingcs_reset, + .enable_irq = softingcs_enable_irq, +}, { + .name = "Vector-CANcard-2", + .manf = 0x0168, .prod = 0x085, + .generation = 2, + .nbus = 2, + .freq = 24 * MHZ, .max_brp = 64, .max_sjw = 4, + .dpram_size = 0x1000, + .boot = {0x0000, 0x000000, fw_dir "bcard2.bin",}, + .load = {0x0120, 0x00f600, fw_dir "ldcard2.bin",}, + .app = {0x0010, 0x0d0000, fw_dir "cancrd2.bin",}, + .reset = softingcs_reset, + .enable_irq = NULL, +}, { + .name = "EDICcard-NEC", + .manf = 0x0168, .prod = 0x102, + .generation = 1, + .nbus = 2, + .freq = 16 * MHZ, .max_brp = 64, .max_sjw = 4, + .dpram_size = 0x0800, + .boot = {0x0000, 0x000000, fw_dir "bcard.bin",}, + .load = {0x0120, 0x00f600, fw_dir "ldcard.bin",}, + .app = {0x0010, 0x0d0000, fw_dir "cancard.bin",}, + .reset = softingcs_reset, + .enable_irq = softingcs_enable_irq, +}, { + .name = "EDICcard-2", + .manf = 0x0168, .prod = 0x105, + .generation = 2, + .nbus = 2, + .freq = 24 * MHZ, .max_brp = 64, .max_sjw = 4, + .dpram_size = 0x1000, + .boot = {0x0000, 0x000000, fw_dir "bcard2.bin",}, + .load = {0x0120, 0x00f600, fw_dir "ldcard2.bin",}, + .app = {0x0010, 0x0d0000, fw_dir "cancrd2.bin",}, + .reset = softingcs_reset, + .enable_irq = NULL, +}, { + 0, 0, +}, +}; + +MODULE_FIRMWARE(fw_dir "bcard.bin"); +MODULE_FIRMWARE(fw_dir "ldcard.bin"); +MODULE_FIRMWARE(fw_dir "cancard.bin"); +MODULE_FIRMWARE(fw_dir "cansja.bin"); + +MODULE_FIRMWARE(fw_dir "bcard2.bin"); +MODULE_FIRMWARE(fw_dir "ldcard2.bin"); +MODULE_FIRMWARE(fw_dir "cancrd2.bin"); + +static __devinit const struct softing_platform_data +*softingcs_find_platform_data(unsigned int manf, unsigned int prod) +{ + const struct softing_platform_data *lp; + + for (lp = softingcs_platform_data; lp->manf; ++lp) { + if ((lp->manf == manf) && (lp->prod == prod)) + return lp; + } + return NULL; +} + +/* + * platformdata callbacks + */ +static int softingcs_reset(struct platform_device *pdev, int v) +{ + struct pcmcia_device *pcmcia = to_pcmcia_dev(pdev->dev.parent); + + dev_dbg(&pdev->dev, "pcmcia config [2] %02x\n", v ? 0 : 0x20); + return pcmcia_write_config_byte(pcmcia, 2, v ? 0 : 0x20); +} + +static int softingcs_enable_irq(struct platform_device *pdev, int v) +{ + struct pcmcia_device *pcmcia = to_pcmcia_dev(pdev->dev.parent); + + dev_dbg(&pdev->dev, "pcmcia config [0] %02x\n", v ? 0x60 : 0); + return pcmcia_write_config_byte(pcmcia, 0, v ? 0x60 : 0); +} + +/* + * pcmcia check + */ +static __devinit int softingcs_probe_config(struct pcmcia_device *pcmcia, + void *priv_data) +{ + struct softing_platform_data *pdat = priv_data; + struct resource *pres; + int memspeed = 0; + + WARN_ON(!pdat); + pres = pcmcia->resource[PCMCIA_IOMEM_0]; + if (resource_size(pres) < 0x1000) + return -ERANGE; + + pres->flags |= WIN_MEMORY_TYPE_CM | WIN_ENABLE; + if (pdat->generation < 2) { + pres->flags |= WIN_USE_WAIT | WIN_DATA_WIDTH_8; + memspeed = 3; + } else { + pres->flags |= WIN_DATA_WIDTH_16; + } + return pcmcia_request_window(pcmcia, pres, memspeed); +} + +static __devexit void softingcs_remove(struct pcmcia_device *pcmcia) +{ + struct platform_device *pdev = pcmcia->priv; + + /* free bits */ + platform_device_unregister(pdev); + /* release pcmcia stuff */ + pcmcia_disable_device(pcmcia); +} + +/* + * platform_device wrapper + * pdev->resource has 2 entries: io & irq + */ +static void softingcs_pdev_release(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + kfree(pdev); +} + +static __devinit int softingcs_probe(struct pcmcia_device *pcmcia) +{ + int ret; + struct platform_device *pdev; + const struct softing_platform_data *pdat; + struct resource *pres; + struct dev { + struct platform_device pdev; + struct resource res[2]; + } *dev; + + /* find matching platform_data */ + pdat = softingcs_find_platform_data(pcmcia->manf_id, pcmcia->card_id); + if (!pdat) + return -ENOTTY; + + /* setup pcmcia device */ + pcmcia->config_flags |= CONF_ENABLE_IRQ | CONF_AUTO_SET_IOMEM | + CONF_AUTO_SET_VPP | CONF_AUTO_CHECK_VCC; + ret = pcmcia_loop_config(pcmcia, softingcs_probe_config, (void *)pdat); + if (ret) + goto pcmcia_failed; + + ret = pcmcia_enable_device(pcmcia); + if (ret < 0) + goto pcmcia_failed; + + pres = pcmcia->resource[PCMCIA_IOMEM_0]; + if (!pres) { + ret = -EBADF; + goto pcmcia_bad; + } + + /* create softing platform device */ + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) { + ret = -ENOMEM; + goto mem_failed; + } + dev->pdev.resource = dev->res; + dev->pdev.num_resources = ARRAY_SIZE(dev->res); + dev->pdev.dev.release = softingcs_pdev_release; + + pdev = &dev->pdev; + pdev->dev.platform_data = (void *)pdat; + pdev->dev.parent = &pcmcia->dev; + pcmcia->priv = pdev; + + /* platform device resources */ + pdev->resource[0].flags = IORESOURCE_MEM; + pdev->resource[0].start = pres->start; + pdev->resource[0].end = pres->end; + + pdev->resource[1].flags = IORESOURCE_IRQ; + pdev->resource[1].start = pcmcia->irq; + pdev->resource[1].end = pdev->resource[1].start; + + /* platform device setup */ + spin_lock(&softingcs_index_lock); + pdev->id = softingcs_index++; + spin_unlock(&softingcs_index_lock); + pdev->name = "softing"; + dev_set_name(&pdev->dev, "softingcs.%i", pdev->id); + ret = platform_device_register(pdev); + if (ret < 0) + goto platform_failed; + + dev_info(&pcmcia->dev, "created %s\n", dev_name(&pdev->dev)); + return 0; + +platform_failed: + kfree(dev); +mem_failed: +pcmcia_bad: +pcmcia_failed: + pcmcia_disable_device(pcmcia); + pcmcia->priv = NULL; + return ret ?: -ENODEV; +} + +static /*const*/ struct pcmcia_device_id softingcs_ids[] = { + /* softing */ + PCMCIA_DEVICE_MANF_CARD(0x0168, 0x0001), + PCMCIA_DEVICE_MANF_CARD(0x0168, 0x0002), + PCMCIA_DEVICE_MANF_CARD(0x0168, 0x0004), + PCMCIA_DEVICE_MANF_CARD(0x0168, 0x0005), + /* vector, manufacturer? */ + PCMCIA_DEVICE_MANF_CARD(0x0168, 0x0081), + PCMCIA_DEVICE_MANF_CARD(0x0168, 0x0084), + PCMCIA_DEVICE_MANF_CARD(0x0168, 0x0085), + /* EDIC */ + PCMCIA_DEVICE_MANF_CARD(0x0168, 0x0102), + PCMCIA_DEVICE_MANF_CARD(0x0168, 0x0105), + PCMCIA_DEVICE_NULL, +}; + +MODULE_DEVICE_TABLE(pcmcia, softingcs_ids); + +static struct pcmcia_driver softingcs_driver = { + .owner = THIS_MODULE, + .name = "softingcs", + .id_table = softingcs_ids, + .probe = softingcs_probe, + .remove = __devexit_p(softingcs_remove), +}; + +static int __init softingcs_start(void) +{ + spin_lock_init(&softingcs_index_lock); + return pcmcia_register_driver(&softingcs_driver); +} + +static void __exit softingcs_stop(void) +{ + pcmcia_unregister_driver(&softingcs_driver); +} + +module_init(softingcs_start); +module_exit(softingcs_stop); + +MODULE_DESCRIPTION("softing CANcard driver" + ", links PCMCIA card to softing driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/can/softing/softing_fw.c b/drivers/net/can/softing/softing_fw.c new file mode 100644 index 00000000000..b520784fb19 --- /dev/null +++ b/drivers/net/can/softing/softing_fw.c @@ -0,0 +1,691 @@ +/* + * Copyright (C) 2008-2010 + * + * - Kurt Van Dijck, EIA Electronics + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the version 2 of the GNU General Public License + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/firmware.h> +#include <linux/sched.h> +#include <asm/div64.h> + +#include "softing.h" + +/* + * low level DPRAM command. + * Make sure that card->dpram[DPRAM_FCT_HOST] is preset + */ +static int _softing_fct_cmd(struct softing *card, int16_t cmd, uint16_t vector, + const char *msg) +{ + int ret; + unsigned long stamp; + + iowrite16(cmd, &card->dpram[DPRAM_FCT_PARAM]); + iowrite8(vector >> 8, &card->dpram[DPRAM_FCT_HOST + 1]); + iowrite8(vector, &card->dpram[DPRAM_FCT_HOST]); + /* be sure to flush this to the card */ + wmb(); + stamp = jiffies + 1 * HZ; + /* wait for card */ + do { + /* DPRAM_FCT_HOST is _not_ aligned */ + ret = ioread8(&card->dpram[DPRAM_FCT_HOST]) + + (ioread8(&card->dpram[DPRAM_FCT_HOST + 1]) << 8); + /* don't have any cached variables */ + rmb(); + if (ret == RES_OK) + /* read return-value now */ + return ioread16(&card->dpram[DPRAM_FCT_RESULT]); + + if ((ret != vector) || time_after(jiffies, stamp)) + break; + /* process context => relax */ + usleep_range(500, 10000); + } while (1); + + ret = (ret == RES_NONE) ? -ETIMEDOUT : -ECANCELED; + dev_alert(&card->pdev->dev, "firmware %s failed (%i)\n", msg, ret); + return ret; +} + +static int softing_fct_cmd(struct softing *card, int16_t cmd, const char *msg) +{ + int ret; + + ret = _softing_fct_cmd(card, cmd, 0, msg); + if (ret > 0) { + dev_alert(&card->pdev->dev, "%s returned %u\n", msg, ret); + ret = -EIO; + } + return ret; +} + +int softing_bootloader_command(struct softing *card, int16_t cmd, + const char *msg) +{ + int ret; + unsigned long stamp; + + iowrite16(RES_NONE, &card->dpram[DPRAM_RECEIPT]); + iowrite16(cmd, &card->dpram[DPRAM_COMMAND]); + /* be sure to flush this to the card */ + wmb(); + stamp = jiffies + 3 * HZ; + /* wait for card */ + do { + ret = ioread16(&card->dpram[DPRAM_RECEIPT]); + /* don't have any cached variables */ + rmb(); + if (ret == RES_OK) + return 0; + if (time_after(jiffies, stamp)) + break; + /* process context => relax */ + usleep_range(500, 10000); + } while (!signal_pending(current)); + + ret = (ret == RES_NONE) ? -ETIMEDOUT : -ECANCELED; + dev_alert(&card->pdev->dev, "bootloader %s failed (%i)\n", msg, ret); + return ret; +} + +static int fw_parse(const uint8_t **pmem, uint16_t *ptype, uint32_t *paddr, + uint16_t *plen, const uint8_t **pdat) +{ + uint16_t checksum[2]; + const uint8_t *mem; + const uint8_t *end; + + /* + * firmware records are a binary, unaligned stream composed of: + * uint16_t type; + * uint32_t addr; + * uint16_t len; + * uint8_t dat[len]; + * uint16_t checksum; + * all values in little endian. + * We could define a struct for this, with __attribute__((packed)), + * but would that solve the alignment in _all_ cases (cfr. the + * struct itself may be an odd address)? + * + * I chose to use leXX_to_cpup() since this solves both + * endianness & alignment. + */ + mem = *pmem; + *ptype = le16_to_cpup((void *)&mem[0]); + *paddr = le32_to_cpup((void *)&mem[2]); + *plen = le16_to_cpup((void *)&mem[6]); + *pdat = &mem[8]; + /* verify checksum */ + end = &mem[8 + *plen]; + checksum[0] = le16_to_cpup((void *)end); + for (checksum[1] = 0; mem < end; ++mem) + checksum[1] += *mem; + if (checksum[0] != checksum[1]) + return -EINVAL; + /* increment */ + *pmem += 10 + *plen; + return 0; +} + +int softing_load_fw(const char *file, struct softing *card, + __iomem uint8_t *dpram, unsigned int size, int offset) +{ + const struct firmware *fw; + int ret; + const uint8_t *mem, *end, *dat; + uint16_t type, len; + uint32_t addr; + uint8_t *buf = NULL; + int buflen = 0; + int8_t type_end = 0; + + ret = request_firmware(&fw, file, &card->pdev->dev); + if (ret < 0) + return ret; + dev_dbg(&card->pdev->dev, "%s, firmware(%s) got %u bytes" + ", offset %c0x%04x\n", + card->pdat->name, file, (unsigned int)fw->size, + (offset >= 0) ? '+' : '-', (unsigned int)abs(offset)); + /* parse the firmware */ + mem = fw->data; + end = &mem[fw->size]; + /* look for header record */ + ret = fw_parse(&mem, &type, &addr, &len, &dat); + if (ret < 0) + goto failed; + if (type != 0xffff) + goto failed; + if (strncmp("Structured Binary Format, Softing GmbH" , dat, len)) { + ret = -EINVAL; + goto failed; + } + /* ok, we had a header */ + while (mem < end) { + ret = fw_parse(&mem, &type, &addr, &len, &dat); + if (ret < 0) + goto failed; + if (type == 3) { + /* start address, not used here */ + continue; + } else if (type == 1) { + /* eof */ + type_end = 1; + break; + } else if (type != 0) { + ret = -EINVAL; + goto failed; + } + + if ((addr + len + offset) > size) + goto failed; + memcpy_toio(&dpram[addr + offset], dat, len); + /* be sure to flush caches from IO space */ + mb(); + if (len > buflen) { + /* align buflen */ + buflen = (len + (1024-1)) & ~(1024-1); + buf = krealloc(buf, buflen, GFP_KERNEL); + if (!buf) { + ret = -ENOMEM; + goto failed; + } + } + /* verify record data */ + memcpy_fromio(buf, &dpram[addr + offset], len); + if (memcmp(buf, dat, len)) { + /* is not ok */ + dev_alert(&card->pdev->dev, "DPRAM readback failed\n"); + ret = -EIO; + goto failed; + } + } + if (!type_end) + /* no end record seen */ + goto failed; + ret = 0; +failed: + kfree(buf); + release_firmware(fw); + if (ret < 0) + dev_info(&card->pdev->dev, "firmware %s failed\n", file); + return ret; +} + +int softing_load_app_fw(const char *file, struct softing *card) +{ + const struct firmware *fw; + const uint8_t *mem, *end, *dat; + int ret, j; + uint16_t type, len; + uint32_t addr, start_addr = 0; + unsigned int sum, rx_sum; + int8_t type_end = 0, type_entrypoint = 0; + + ret = request_firmware(&fw, file, &card->pdev->dev); + if (ret) { + dev_alert(&card->pdev->dev, "request_firmware(%s) got %i\n", + file, ret); + return ret; + } + dev_dbg(&card->pdev->dev, "firmware(%s) got %lu bytes\n", + file, (unsigned long)fw->size); + /* parse the firmware */ + mem = fw->data; + end = &mem[fw->size]; + /* look for header record */ + ret = fw_parse(&mem, &type, &addr, &len, &dat); + if (ret) + goto failed; + ret = -EINVAL; + if (type != 0xffff) { + dev_alert(&card->pdev->dev, "firmware starts with type 0x%x\n", + type); + goto failed; + } + if (strncmp("Structured Binary Format, Softing GmbH", dat, len)) { + dev_alert(&card->pdev->dev, "firmware string '%.*s' fault\n", + len, dat); + goto failed; + } + /* ok, we had a header */ + while (mem < end) { + ret = fw_parse(&mem, &type, &addr, &len, &dat); + if (ret) + goto failed; + + if (type == 3) { + /* start address */ + start_addr = addr; + type_entrypoint = 1; + continue; + } else if (type == 1) { + /* eof */ + type_end = 1; + break; + } else if (type != 0) { + dev_alert(&card->pdev->dev, + "unknown record type 0x%04x\n", type); + ret = -EINVAL; + goto failed; + } + + /* regualar data */ + for (sum = 0, j = 0; j < len; ++j) + sum += dat[j]; + /* work in 16bit (target) */ + sum &= 0xffff; + + memcpy_toio(&card->dpram[card->pdat->app.offs], dat, len); + iowrite32(card->pdat->app.offs + card->pdat->app.addr, + &card->dpram[DPRAM_COMMAND + 2]); + iowrite32(addr, &card->dpram[DPRAM_COMMAND + 6]); + iowrite16(len, &card->dpram[DPRAM_COMMAND + 10]); + iowrite8(1, &card->dpram[DPRAM_COMMAND + 12]); + ret = softing_bootloader_command(card, 1, "loading app."); + if (ret < 0) + goto failed; + /* verify checksum */ + rx_sum = ioread16(&card->dpram[DPRAM_RECEIPT + 2]); + if (rx_sum != sum) { + dev_alert(&card->pdev->dev, "SRAM seems to be damaged" + ", wanted 0x%04x, got 0x%04x\n", sum, rx_sum); + ret = -EIO; + goto failed; + } + } + if (!type_end || !type_entrypoint) + goto failed; + /* start application in card */ + iowrite32(start_addr, &card->dpram[DPRAM_COMMAND + 2]); + iowrite8(1, &card->dpram[DPRAM_COMMAND + 6]); + ret = softing_bootloader_command(card, 3, "start app."); + if (ret < 0) + goto failed; + ret = 0; +failed: + release_firmware(fw); + if (ret < 0) + dev_info(&card->pdev->dev, "firmware %s failed\n", file); + return ret; +} + +static int softing_reset_chip(struct softing *card) +{ + int ret; + + do { + /* reset chip */ + iowrite8(0, &card->dpram[DPRAM_RESET_RX_FIFO]); + iowrite8(0, &card->dpram[DPRAM_RESET_RX_FIFO+1]); + iowrite8(1, &card->dpram[DPRAM_RESET]); + iowrite8(0, &card->dpram[DPRAM_RESET+1]); + + ret = softing_fct_cmd(card, 0, "reset_can"); + if (!ret) + break; + if (signal_pending(current)) + /* don't wait any longer */ + break; + } while (1); + card->tx.pending = 0; + return ret; +} + +int softing_chip_poweron(struct softing *card) +{ + int ret; + /* sync */ + ret = _softing_fct_cmd(card, 99, 0x55, "sync-a"); + if (ret < 0) + goto failed; + + ret = _softing_fct_cmd(card, 99, 0xaa, "sync-b"); + if (ret < 0) + goto failed; + + ret = softing_reset_chip(card); + if (ret < 0) + goto failed; + /* get_serial */ + ret = softing_fct_cmd(card, 43, "get_serial_number"); + if (ret < 0) + goto failed; + card->id.serial = ioread32(&card->dpram[DPRAM_FCT_PARAM]); + /* get_version */ + ret = softing_fct_cmd(card, 12, "get_version"); + if (ret < 0) + goto failed; + card->id.fw_version = ioread16(&card->dpram[DPRAM_FCT_PARAM + 2]); + card->id.hw_version = ioread16(&card->dpram[DPRAM_FCT_PARAM + 4]); + card->id.license = ioread16(&card->dpram[DPRAM_FCT_PARAM + 6]); + card->id.chip[0] = ioread16(&card->dpram[DPRAM_FCT_PARAM + 8]); + card->id.chip[1] = ioread16(&card->dpram[DPRAM_FCT_PARAM + 10]); + return 0; +failed: + return ret; +} + +static void softing_initialize_timestamp(struct softing *card) +{ + uint64_t ovf; + + card->ts_ref = ktime_get(); + + /* 16MHz is the reference */ + ovf = 0x100000000ULL * 16; + do_div(ovf, card->pdat->freq ?: 16); + + card->ts_overflow = ktime_add_us(ktime_set(0, 0), ovf); +} + +ktime_t softing_raw2ktime(struct softing *card, u32 raw) +{ + uint64_t rawl; + ktime_t now, real_offset; + ktime_t target; + ktime_t tmp; + + now = ktime_get(); + real_offset = ktime_sub(ktime_get_real(), now); + + /* find nsec from card */ + rawl = raw * 16; + do_div(rawl, card->pdat->freq ?: 16); + target = ktime_add_us(card->ts_ref, rawl); + /* test for overflows */ + tmp = ktime_add(target, card->ts_overflow); + while (unlikely(ktime_to_ns(tmp) > ktime_to_ns(now))) { + card->ts_ref = ktime_add(card->ts_ref, card->ts_overflow); + target = tmp; + tmp = ktime_add(target, card->ts_overflow); + } + return ktime_add(target, real_offset); +} + +static inline int softing_error_reporting(struct net_device *netdev) +{ + struct softing_priv *priv = netdev_priv(netdev); + + return (priv->can.ctrlmode & CAN_CTRLMODE_BERR_REPORTING) + ? 1 : 0; +} + +int softing_startstop(struct net_device *dev, int up) +{ + int ret; + struct softing *card; + struct softing_priv *priv; + struct net_device *netdev; + int bus_bitmask_start; + int j, error_reporting; + struct can_frame msg; + const struct can_bittiming *bt; + + priv = netdev_priv(dev); + card = priv->card; + + if (!card->fw.up) + return -EIO; + + ret = mutex_lock_interruptible(&card->fw.lock); + if (ret) + return ret; + + bus_bitmask_start = 0; + if (dev && up) + /* prepare to start this bus as well */ + bus_bitmask_start |= (1 << priv->index); + /* bring netdevs down */ + for (j = 0; j < ARRAY_SIZE(card->net); ++j) { + netdev = card->net[j]; + if (!netdev) + continue; + priv = netdev_priv(netdev); + + if (dev != netdev) + netif_stop_queue(netdev); + + if (netif_running(netdev)) { + if (dev != netdev) + bus_bitmask_start |= (1 << j); + priv->tx.pending = 0; + priv->tx.echo_put = 0; + priv->tx.echo_get = 0; + /* + * this bus' may just have called open_candev() + * which is rather stupid to call close_candev() + * already + * but we may come here from busoff recovery too + * in which case the echo_skb _needs_ flushing too. + * just be sure to call open_candev() again + */ + close_candev(netdev); + } + priv->can.state = CAN_STATE_STOPPED; + } + card->tx.pending = 0; + + softing_enable_irq(card, 0); + ret = softing_reset_chip(card); + if (ret) + goto failed; + if (!bus_bitmask_start) + /* no busses to be brought up */ + goto card_done; + + if ((bus_bitmask_start & 1) && (bus_bitmask_start & 2) + && (softing_error_reporting(card->net[0]) + != softing_error_reporting(card->net[1]))) { + dev_alert(&card->pdev->dev, + "err_reporting flag differs for busses\n"); + goto invalid; + } + error_reporting = 0; + if (bus_bitmask_start & 1) { + netdev = card->net[0]; + priv = netdev_priv(netdev); + error_reporting += softing_error_reporting(netdev); + /* init chip 1 */ + bt = &priv->can.bittiming; + iowrite16(bt->brp, &card->dpram[DPRAM_FCT_PARAM + 2]); + iowrite16(bt->sjw, &card->dpram[DPRAM_FCT_PARAM + 4]); + iowrite16(bt->phase_seg1 + bt->prop_seg, + &card->dpram[DPRAM_FCT_PARAM + 6]); + iowrite16(bt->phase_seg2, &card->dpram[DPRAM_FCT_PARAM + 8]); + iowrite16((priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) ? 1 : 0, + &card->dpram[DPRAM_FCT_PARAM + 10]); + ret = softing_fct_cmd(card, 1, "initialize_chip[0]"); + if (ret < 0) + goto failed; + /* set mode */ + iowrite16(0, &card->dpram[DPRAM_FCT_PARAM + 2]); + iowrite16(0, &card->dpram[DPRAM_FCT_PARAM + 4]); + ret = softing_fct_cmd(card, 3, "set_mode[0]"); + if (ret < 0) + goto failed; + /* set filter */ + /* 11bit id & mask */ + iowrite16(0x0000, &card->dpram[DPRAM_FCT_PARAM + 2]); + iowrite16(0x07ff, &card->dpram[DPRAM_FCT_PARAM + 4]); + /* 29bit id.lo & mask.lo & id.hi & mask.hi */ + iowrite16(0x0000, &card->dpram[DPRAM_FCT_PARAM + 6]); + iowrite16(0xffff, &card->dpram[DPRAM_FCT_PARAM + 8]); + iowrite16(0x0000, &card->dpram[DPRAM_FCT_PARAM + 10]); + iowrite16(0x1fff, &card->dpram[DPRAM_FCT_PARAM + 12]); + ret = softing_fct_cmd(card, 7, "set_filter[0]"); + if (ret < 0) + goto failed; + /* set output control */ + iowrite16(priv->output, &card->dpram[DPRAM_FCT_PARAM + 2]); + ret = softing_fct_cmd(card, 5, "set_output[0]"); + if (ret < 0) + goto failed; + } + if (bus_bitmask_start & 2) { + netdev = card->net[1]; + priv = netdev_priv(netdev); + error_reporting += softing_error_reporting(netdev); + /* init chip2 */ + bt = &priv->can.bittiming; + iowrite16(bt->brp, &card->dpram[DPRAM_FCT_PARAM + 2]); + iowrite16(bt->sjw, &card->dpram[DPRAM_FCT_PARAM + 4]); + iowrite16(bt->phase_seg1 + bt->prop_seg, + &card->dpram[DPRAM_FCT_PARAM + 6]); + iowrite16(bt->phase_seg2, &card->dpram[DPRAM_FCT_PARAM + 8]); + iowrite16((priv->can.ctrlmode & CAN_CTRLMODE_3_SAMPLES) ? 1 : 0, + &card->dpram[DPRAM_FCT_PARAM + 10]); + ret = softing_fct_cmd(card, 2, "initialize_chip[1]"); + if (ret < 0) + goto failed; + /* set mode2 */ + iowrite16(0, &card->dpram[DPRAM_FCT_PARAM + 2]); + iowrite16(0, &card->dpram[DPRAM_FCT_PARAM + 4]); + ret = softing_fct_cmd(card, 4, "set_mode[1]"); + if (ret < 0) + goto failed; + /* set filter2 */ + /* 11bit id & mask */ + iowrite16(0x0000, &card->dpram[DPRAM_FCT_PARAM + 2]); + iowrite16(0x07ff, &card->dpram[DPRAM_FCT_PARAM + 4]); + /* 29bit id.lo & mask.lo & id.hi & mask.hi */ + iowrite16(0x0000, &card->dpram[DPRAM_FCT_PARAM + 6]); + iowrite16(0xffff, &card->dpram[DPRAM_FCT_PARAM + 8]); + iowrite16(0x0000, &card->dpram[DPRAM_FCT_PARAM + 10]); + iowrite16(0x1fff, &card->dpram[DPRAM_FCT_PARAM + 12]); + ret = softing_fct_cmd(card, 8, "set_filter[1]"); + if (ret < 0) + goto failed; + /* set output control2 */ + iowrite16(priv->output, &card->dpram[DPRAM_FCT_PARAM + 2]); + ret = softing_fct_cmd(card, 6, "set_output[1]"); + if (ret < 0) + goto failed; + } + /* enable_error_frame */ + /* + * Error reporting is switched off at the moment since + * the receiving of them is not yet 100% verified + * This should be enabled sooner or later + * + if (error_reporting) { + ret = softing_fct_cmd(card, 51, "enable_error_frame"); + if (ret < 0) + goto failed; + } + */ + /* initialize interface */ + iowrite16(1, &card->dpram[DPRAM_FCT_PARAM + 2]); + iowrite16(1, &card->dpram[DPRAM_FCT_PARAM + 4]); + iowrite16(1, &card->dpram[DPRAM_FCT_PARAM + 6]); + iowrite16(1, &card->dpram[DPRAM_FCT_PARAM + 8]); + iowrite16(1, &card->dpram[DPRAM_FCT_PARAM + 10]); + iowrite16(1, &card->dpram[DPRAM_FCT_PARAM + 12]); + iowrite16(1, &card->dpram[DPRAM_FCT_PARAM + 14]); + iowrite16(1, &card->dpram[DPRAM_FCT_PARAM + 16]); + iowrite16(1, &card->dpram[DPRAM_FCT_PARAM + 18]); + iowrite16(1, &card->dpram[DPRAM_FCT_PARAM + 20]); + ret = softing_fct_cmd(card, 17, "initialize_interface"); + if (ret < 0) + goto failed; + /* enable_fifo */ + ret = softing_fct_cmd(card, 36, "enable_fifo"); + if (ret < 0) + goto failed; + /* enable fifo tx ack */ + ret = softing_fct_cmd(card, 13, "fifo_tx_ack[0]"); + if (ret < 0) + goto failed; + /* enable fifo tx ack2 */ + ret = softing_fct_cmd(card, 14, "fifo_tx_ack[1]"); + if (ret < 0) + goto failed; + /* start_chip */ + ret = softing_fct_cmd(card, 11, "start_chip"); + if (ret < 0) + goto failed; + iowrite8(0, &card->dpram[DPRAM_INFO_BUSSTATE]); + iowrite8(0, &card->dpram[DPRAM_INFO_BUSSTATE2]); + if (card->pdat->generation < 2) { + iowrite8(0, &card->dpram[DPRAM_V2_IRQ_TOHOST]); + /* flush the DPRAM caches */ + wmb(); + } + + softing_initialize_timestamp(card); + + /* + * do socketcan notifications/status changes + * from here, no errors should occur, or the failed: part + * must be reviewed + */ + memset(&msg, 0, sizeof(msg)); + msg.can_id = CAN_ERR_FLAG | CAN_ERR_RESTARTED; + msg.can_dlc = CAN_ERR_DLC; + for (j = 0; j < ARRAY_SIZE(card->net); ++j) { + if (!(bus_bitmask_start & (1 << j))) + continue; + netdev = card->net[j]; + if (!netdev) + continue; + priv = netdev_priv(netdev); + priv->can.state = CAN_STATE_ERROR_ACTIVE; + open_candev(netdev); + if (dev != netdev) { + /* notify other busses on the restart */ + softing_netdev_rx(netdev, &msg, ktime_set(0, 0)); + ++priv->can.can_stats.restarts; + } + netif_wake_queue(netdev); + } + + /* enable interrupts */ + ret = softing_enable_irq(card, 1); + if (ret) + goto failed; +card_done: + mutex_unlock(&card->fw.lock); + return 0; +invalid: + ret = -EINVAL; +failed: + softing_enable_irq(card, 0); + softing_reset_chip(card); + mutex_unlock(&card->fw.lock); + /* bring all other interfaces down */ + for (j = 0; j < ARRAY_SIZE(card->net); ++j) { + netdev = card->net[j]; + if (!netdev) + continue; + dev_close(netdev); + } + return ret; +} + +int softing_default_output(struct net_device *netdev) +{ + struct softing_priv *priv = netdev_priv(netdev); + struct softing *card = priv->card; + + switch (priv->chip) { + case 1000: + return (card->pdat->generation < 2) ? 0xfb : 0xfa; + case 5: + return 0x60; + default: + return 0x40; + } +} diff --git a/drivers/net/can/softing/softing_main.c b/drivers/net/can/softing/softing_main.c new file mode 100644 index 00000000000..5157e15e96e --- /dev/null +++ b/drivers/net/can/softing/softing_main.c @@ -0,0 +1,893 @@ +/* + * Copyright (C) 2008-2010 + * + * - Kurt Van Dijck, EIA Electronics + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the version 2 of the GNU General Public License + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/interrupt.h> + +#include "softing.h" + +#define TX_ECHO_SKB_MAX (((TXMAX+1)/2)-1) + +/* + * test is a specific CAN netdev + * is online (ie. up 'n running, not sleeping, not busoff + */ +static inline int canif_is_active(struct net_device *netdev) +{ + struct can_priv *can = netdev_priv(netdev); + + if (!netif_running(netdev)) + return 0; + return (can->state <= CAN_STATE_ERROR_PASSIVE); +} + +/* reset DPRAM */ +static inline void softing_set_reset_dpram(struct softing *card) +{ + if (card->pdat->generation >= 2) { + spin_lock_bh(&card->spin); + iowrite8(ioread8(&card->dpram[DPRAM_V2_RESET]) & ~1, + &card->dpram[DPRAM_V2_RESET]); + spin_unlock_bh(&card->spin); + } +} + +static inline void softing_clr_reset_dpram(struct softing *card) +{ + if (card->pdat->generation >= 2) { + spin_lock_bh(&card->spin); + iowrite8(ioread8(&card->dpram[DPRAM_V2_RESET]) | 1, + &card->dpram[DPRAM_V2_RESET]); + spin_unlock_bh(&card->spin); + } +} + +/* trigger the tx queue-ing */ +static netdev_tx_t softing_netdev_start_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct softing_priv *priv = netdev_priv(dev); + struct softing *card = priv->card; + int ret; + uint8_t *ptr; + uint8_t fifo_wr, fifo_rd; + struct can_frame *cf = (struct can_frame *)skb->data; + uint8_t buf[DPRAM_TX_SIZE]; + + if (can_dropped_invalid_skb(dev, skb)) + return NETDEV_TX_OK; + + spin_lock(&card->spin); + + ret = NETDEV_TX_BUSY; + if (!card->fw.up || + (card->tx.pending >= TXMAX) || + (priv->tx.pending >= TX_ECHO_SKB_MAX)) + goto xmit_done; + fifo_wr = ioread8(&card->dpram[DPRAM_TX_WR]); + fifo_rd = ioread8(&card->dpram[DPRAM_TX_RD]); + if (fifo_wr == fifo_rd) + /* fifo full */ + goto xmit_done; + memset(buf, 0, sizeof(buf)); + ptr = buf; + *ptr = CMD_TX; + if (cf->can_id & CAN_RTR_FLAG) + *ptr |= CMD_RTR; + if (cf->can_id & CAN_EFF_FLAG) + *ptr |= CMD_XTD; + if (priv->index) + *ptr |= CMD_BUS2; + ++ptr; + *ptr++ = cf->can_dlc; + *ptr++ = (cf->can_id >> 0); + *ptr++ = (cf->can_id >> 8); + if (cf->can_id & CAN_EFF_FLAG) { + *ptr++ = (cf->can_id >> 16); + *ptr++ = (cf->can_id >> 24); + } else { + /* increment 1, not 2 as you might think */ + ptr += 1; + } + if (!(cf->can_id & CAN_RTR_FLAG)) + memcpy(ptr, &cf->data[0], cf->can_dlc); + memcpy_toio(&card->dpram[DPRAM_TX + DPRAM_TX_SIZE * fifo_wr], + buf, DPRAM_TX_SIZE); + if (++fifo_wr >= DPRAM_TX_CNT) + fifo_wr = 0; + iowrite8(fifo_wr, &card->dpram[DPRAM_TX_WR]); + card->tx.last_bus = priv->index; + ++card->tx.pending; + ++priv->tx.pending; + can_put_echo_skb(skb, dev, priv->tx.echo_put); + ++priv->tx.echo_put; + if (priv->tx.echo_put >= TX_ECHO_SKB_MAX) + priv->tx.echo_put = 0; + /* can_put_echo_skb() saves the skb, safe to return TX_OK */ + ret = NETDEV_TX_OK; +xmit_done: + spin_unlock(&card->spin); + if (card->tx.pending >= TXMAX) { + int j; + for (j = 0; j < ARRAY_SIZE(card->net); ++j) { + if (card->net[j]) + netif_stop_queue(card->net[j]); + } + } + if (ret != NETDEV_TX_OK) + netif_stop_queue(dev); + + return ret; +} + +/* + * shortcut for skb delivery + */ +int softing_netdev_rx(struct net_device *netdev, const struct can_frame *msg, + ktime_t ktime) +{ + struct sk_buff *skb; + struct can_frame *cf; + + skb = alloc_can_skb(netdev, &cf); + if (!skb) + return -ENOMEM; + memcpy(cf, msg, sizeof(*msg)); + skb->tstamp = ktime; + return netif_rx(skb); +} + +/* + * softing_handle_1 + * pop 1 entry from the DPRAM queue, and process + */ +static int softing_handle_1(struct softing *card) +{ + struct net_device *netdev; + struct softing_priv *priv; + ktime_t ktime; + struct can_frame msg; + int cnt = 0, lost_msg; + uint8_t fifo_rd, fifo_wr, cmd; + uint8_t *ptr; + uint32_t tmp_u32; + uint8_t buf[DPRAM_RX_SIZE]; + + memset(&msg, 0, sizeof(msg)); + /* test for lost msgs */ + lost_msg = ioread8(&card->dpram[DPRAM_RX_LOST]); + if (lost_msg) { + int j; + /* reset condition */ + iowrite8(0, &card->dpram[DPRAM_RX_LOST]); + /* prepare msg */ + msg.can_id = CAN_ERR_FLAG | CAN_ERR_CRTL; + msg.can_dlc = CAN_ERR_DLC; + msg.data[1] = CAN_ERR_CRTL_RX_OVERFLOW; + /* + * service to all busses, we don't know which it was applicable + * but only service busses that are online + */ + for (j = 0; j < ARRAY_SIZE(card->net); ++j) { + netdev = card->net[j]; + if (!netdev) + continue; + if (!canif_is_active(netdev)) + /* a dead bus has no overflows */ + continue; + ++netdev->stats.rx_over_errors; + softing_netdev_rx(netdev, &msg, ktime_set(0, 0)); + } + /* prepare for other use */ + memset(&msg, 0, sizeof(msg)); + ++cnt; + } + + fifo_rd = ioread8(&card->dpram[DPRAM_RX_RD]); + fifo_wr = ioread8(&card->dpram[DPRAM_RX_WR]); + + if (++fifo_rd >= DPRAM_RX_CNT) + fifo_rd = 0; + if (fifo_wr == fifo_rd) + return cnt; + + memcpy_fromio(buf, &card->dpram[DPRAM_RX + DPRAM_RX_SIZE*fifo_rd], + DPRAM_RX_SIZE); + mb(); + /* trigger dual port RAM */ + iowrite8(fifo_rd, &card->dpram[DPRAM_RX_RD]); + + ptr = buf; + cmd = *ptr++; + if (cmd == 0xff) + /* not quite usefull, probably the card has got out */ + return 0; + netdev = card->net[0]; + if (cmd & CMD_BUS2) + netdev = card->net[1]; + priv = netdev_priv(netdev); + + if (cmd & CMD_ERR) { + uint8_t can_state, state; + + state = *ptr++; + + msg.can_id = CAN_ERR_FLAG; + msg.can_dlc = CAN_ERR_DLC; + + if (state & SF_MASK_BUSOFF) { + can_state = CAN_STATE_BUS_OFF; + msg.can_id |= CAN_ERR_BUSOFF; + state = STATE_BUSOFF; + } else if (state & SF_MASK_EPASSIVE) { + can_state = CAN_STATE_ERROR_PASSIVE; + msg.can_id |= CAN_ERR_CRTL; + msg.data[1] = CAN_ERR_CRTL_TX_PASSIVE; + state = STATE_EPASSIVE; + } else { + can_state = CAN_STATE_ERROR_ACTIVE; + msg.can_id |= CAN_ERR_CRTL; + state = STATE_EACTIVE; + } + /* update DPRAM */ + iowrite8(state, &card->dpram[priv->index ? + DPRAM_INFO_BUSSTATE2 : DPRAM_INFO_BUSSTATE]); + /* timestamp */ + tmp_u32 = le32_to_cpup((void *)ptr); + ptr += 4; + ktime = softing_raw2ktime(card, tmp_u32); + + ++netdev->stats.rx_errors; + /* update internal status */ + if (can_state != priv->can.state) { + priv->can.state = can_state; + if (can_state == CAN_STATE_ERROR_PASSIVE) + ++priv->can.can_stats.error_passive; + else if (can_state == CAN_STATE_BUS_OFF) { + /* this calls can_close_cleanup() */ + can_bus_off(netdev); + netif_stop_queue(netdev); + } + /* trigger socketcan */ + softing_netdev_rx(netdev, &msg, ktime); + } + + } else { + if (cmd & CMD_RTR) + msg.can_id |= CAN_RTR_FLAG; + msg.can_dlc = get_can_dlc(*ptr++); + if (cmd & CMD_XTD) { + msg.can_id |= CAN_EFF_FLAG; + msg.can_id |= le32_to_cpup((void *)ptr); + ptr += 4; + } else { + msg.can_id |= le16_to_cpup((void *)ptr); + ptr += 2; + } + /* timestamp */ + tmp_u32 = le32_to_cpup((void *)ptr); + ptr += 4; + ktime = softing_raw2ktime(card, tmp_u32); + if (!(msg.can_id & CAN_RTR_FLAG)) + memcpy(&msg.data[0], ptr, 8); + ptr += 8; + /* update socket */ + if (cmd & CMD_ACK) { + /* acknowledge, was tx msg */ + struct sk_buff *skb; + skb = priv->can.echo_skb[priv->tx.echo_get]; + if (skb) + skb->tstamp = ktime; + can_get_echo_skb(netdev, priv->tx.echo_get); + ++priv->tx.echo_get; + if (priv->tx.echo_get >= TX_ECHO_SKB_MAX) + priv->tx.echo_get = 0; + if (priv->tx.pending) + --priv->tx.pending; + if (card->tx.pending) + --card->tx.pending; + ++netdev->stats.tx_packets; + if (!(msg.can_id & CAN_RTR_FLAG)) + netdev->stats.tx_bytes += msg.can_dlc; + } else { + int ret; + + ret = softing_netdev_rx(netdev, &msg, ktime); + if (ret == NET_RX_SUCCESS) { + ++netdev->stats.rx_packets; + if (!(msg.can_id & CAN_RTR_FLAG)) + netdev->stats.rx_bytes += msg.can_dlc; + } else { + ++netdev->stats.rx_dropped; + } + } + } + ++cnt; + return cnt; +} + +/* + * real interrupt handler + */ +static irqreturn_t softing_irq_thread(int irq, void *dev_id) +{ + struct softing *card = (struct softing *)dev_id; + struct net_device *netdev; + struct softing_priv *priv; + int j, offset, work_done; + + work_done = 0; + spin_lock_bh(&card->spin); + while (softing_handle_1(card) > 0) { + ++card->irq.svc_count; + ++work_done; + } + spin_unlock_bh(&card->spin); + /* resume tx queue's */ + offset = card->tx.last_bus; + for (j = 0; j < ARRAY_SIZE(card->net); ++j) { + if (card->tx.pending >= TXMAX) + break; + netdev = card->net[(j + offset + 1) % card->pdat->nbus]; + if (!netdev) + continue; + priv = netdev_priv(netdev); + if (!canif_is_active(netdev)) + /* it makes no sense to wake dead busses */ + continue; + if (priv->tx.pending >= TX_ECHO_SKB_MAX) + continue; + ++work_done; + netif_wake_queue(netdev); + } + return work_done ? IRQ_HANDLED : IRQ_NONE; +} + +/* + * interrupt routines: + * schedule the 'real interrupt handler' + */ +static irqreturn_t softing_irq_v2(int irq, void *dev_id) +{ + struct softing *card = (struct softing *)dev_id; + uint8_t ir; + + ir = ioread8(&card->dpram[DPRAM_V2_IRQ_TOHOST]); + iowrite8(0, &card->dpram[DPRAM_V2_IRQ_TOHOST]); + return (1 == ir) ? IRQ_WAKE_THREAD : IRQ_NONE; +} + +static irqreturn_t softing_irq_v1(int irq, void *dev_id) +{ + struct softing *card = (struct softing *)dev_id; + uint8_t ir; + + ir = ioread8(&card->dpram[DPRAM_IRQ_TOHOST]); + iowrite8(0, &card->dpram[DPRAM_IRQ_TOHOST]); + return ir ? IRQ_WAKE_THREAD : IRQ_NONE; +} + +/* + * netdev/candev inter-operability + */ +static int softing_netdev_open(struct net_device *ndev) +{ + int ret; + + /* check or determine and set bittime */ + ret = open_candev(ndev); + if (!ret) + ret = softing_startstop(ndev, 1); + return ret; +} + +static int softing_netdev_stop(struct net_device *ndev) +{ + int ret; + + netif_stop_queue(ndev); + + /* softing cycle does close_candev() */ + ret = softing_startstop(ndev, 0); + return ret; +} + +static int softing_candev_set_mode(struct net_device *ndev, enum can_mode mode) +{ + int ret; + + switch (mode) { + case CAN_MODE_START: + /* softing_startstop does close_candev() */ + ret = softing_startstop(ndev, 1); + return ret; + case CAN_MODE_STOP: + case CAN_MODE_SLEEP: + return -EOPNOTSUPP; + } + return 0; +} + +/* + * Softing device management helpers + */ +int softing_enable_irq(struct softing *card, int enable) +{ + int ret; + + if (!card->irq.nr) { + return 0; + } else if (card->irq.requested && !enable) { + free_irq(card->irq.nr, card); + card->irq.requested = 0; + } else if (!card->irq.requested && enable) { + ret = request_threaded_irq(card->irq.nr, + (card->pdat->generation >= 2) ? + softing_irq_v2 : softing_irq_v1, + softing_irq_thread, IRQF_SHARED, + dev_name(&card->pdev->dev), card); + if (ret) { + dev_alert(&card->pdev->dev, + "request_threaded_irq(%u) failed\n", + card->irq.nr); + return ret; + } + card->irq.requested = 1; + } + return 0; +} + +static void softing_card_shutdown(struct softing *card) +{ + int fw_up = 0; + + if (mutex_lock_interruptible(&card->fw.lock)) + /* return -ERESTARTSYS */; + fw_up = card->fw.up; + card->fw.up = 0; + + if (card->irq.requested && card->irq.nr) { + free_irq(card->irq.nr, card); + card->irq.requested = 0; + } + if (fw_up) { + if (card->pdat->enable_irq) + card->pdat->enable_irq(card->pdev, 0); + softing_set_reset_dpram(card); + if (card->pdat->reset) + card->pdat->reset(card->pdev, 1); + } + mutex_unlock(&card->fw.lock); +} + +static __devinit int softing_card_boot(struct softing *card) +{ + int ret, j; + static const uint8_t stream[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, }; + unsigned char back[sizeof(stream)]; + + if (mutex_lock_interruptible(&card->fw.lock)) + return -ERESTARTSYS; + if (card->fw.up) { + mutex_unlock(&card->fw.lock); + return 0; + } + /* reset board */ + if (card->pdat->enable_irq) + card->pdat->enable_irq(card->pdev, 1); + /* boot card */ + softing_set_reset_dpram(card); + if (card->pdat->reset) + card->pdat->reset(card->pdev, 1); + for (j = 0; (j + sizeof(stream)) < card->dpram_size; + j += sizeof(stream)) { + + memcpy_toio(&card->dpram[j], stream, sizeof(stream)); + /* flush IO cache */ + mb(); + memcpy_fromio(back, &card->dpram[j], sizeof(stream)); + + if (!memcmp(back, stream, sizeof(stream))) + continue; + /* memory is not equal */ + dev_alert(&card->pdev->dev, "dpram failed at 0x%04x\n", j); + ret = -EIO; + goto failed; + } + wmb(); + /* load boot firmware */ + ret = softing_load_fw(card->pdat->boot.fw, card, card->dpram, + card->dpram_size, + card->pdat->boot.offs - card->pdat->boot.addr); + if (ret < 0) + goto failed; + /* load loader firmware */ + ret = softing_load_fw(card->pdat->load.fw, card, card->dpram, + card->dpram_size, + card->pdat->load.offs - card->pdat->load.addr); + if (ret < 0) + goto failed; + + if (card->pdat->reset) + card->pdat->reset(card->pdev, 0); + softing_clr_reset_dpram(card); + ret = softing_bootloader_command(card, 0, "card boot"); + if (ret < 0) + goto failed; + ret = softing_load_app_fw(card->pdat->app.fw, card); + if (ret < 0) + goto failed; + + ret = softing_chip_poweron(card); + if (ret < 0) + goto failed; + + card->fw.up = 1; + mutex_unlock(&card->fw.lock); + return 0; +failed: + card->fw.up = 0; + if (card->pdat->enable_irq) + card->pdat->enable_irq(card->pdev, 0); + softing_set_reset_dpram(card); + if (card->pdat->reset) + card->pdat->reset(card->pdev, 1); + mutex_unlock(&card->fw.lock); + return ret; +} + +/* + * netdev sysfs + */ +static ssize_t show_channel(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + struct softing_priv *priv = netdev2softing(ndev); + + return sprintf(buf, "%i\n", priv->index); +} + +static ssize_t show_chip(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + struct softing_priv *priv = netdev2softing(ndev); + + return sprintf(buf, "%i\n", priv->chip); +} + +static ssize_t show_output(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + struct softing_priv *priv = netdev2softing(ndev); + + return sprintf(buf, "0x%02x\n", priv->output); +} + +static ssize_t store_output(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct net_device *ndev = to_net_dev(dev); + struct softing_priv *priv = netdev2softing(ndev); + struct softing *card = priv->card; + unsigned long val; + int ret; + + ret = strict_strtoul(buf, 0, &val); + if (ret < 0) + return ret; + val &= 0xFF; + + ret = mutex_lock_interruptible(&card->fw.lock); + if (ret) + return -ERESTARTSYS; + if (netif_running(ndev)) { + mutex_unlock(&card->fw.lock); + return -EBUSY; + } + priv->output = val; + mutex_unlock(&card->fw.lock); + return count; +} + +static const DEVICE_ATTR(channel, S_IRUGO, show_channel, NULL); +static const DEVICE_ATTR(chip, S_IRUGO, show_chip, NULL); +static const DEVICE_ATTR(output, S_IRUGO | S_IWUSR, show_output, store_output); + +static const struct attribute *const netdev_sysfs_attrs[] = { + &dev_attr_channel.attr, + &dev_attr_chip.attr, + &dev_attr_output.attr, + NULL, +}; +static const struct attribute_group netdev_sysfs_group = { + .name = NULL, + .attrs = (struct attribute **)netdev_sysfs_attrs, +}; + +static const struct net_device_ops softing_netdev_ops = { + .ndo_open = softing_netdev_open, + .ndo_stop = softing_netdev_stop, + .ndo_start_xmit = softing_netdev_start_xmit, +}; + +static const struct can_bittiming_const softing_btr_const = { + .tseg1_min = 1, + .tseg1_max = 16, + .tseg2_min = 1, + .tseg2_max = 8, + .sjw_max = 4, /* overruled */ + .brp_min = 1, + .brp_max = 32, /* overruled */ + .brp_inc = 1, +}; + + +static __devinit struct net_device *softing_netdev_create(struct softing *card, + uint16_t chip_id) +{ + struct net_device *netdev; + struct softing_priv *priv; + + netdev = alloc_candev(sizeof(*priv), TX_ECHO_SKB_MAX); + if (!netdev) { + dev_alert(&card->pdev->dev, "alloc_candev failed\n"); + return NULL; + } + priv = netdev_priv(netdev); + priv->netdev = netdev; + priv->card = card; + memcpy(&priv->btr_const, &softing_btr_const, sizeof(priv->btr_const)); + priv->btr_const.brp_max = card->pdat->max_brp; + priv->btr_const.sjw_max = card->pdat->max_sjw; + priv->can.bittiming_const = &priv->btr_const; + priv->can.clock.freq = 8000000; + priv->chip = chip_id; + priv->output = softing_default_output(netdev); + SET_NETDEV_DEV(netdev, &card->pdev->dev); + + netdev->flags |= IFF_ECHO; + netdev->netdev_ops = &softing_netdev_ops; + priv->can.do_set_mode = softing_candev_set_mode; + priv->can.ctrlmode_supported = CAN_CTRLMODE_3_SAMPLES; + + return netdev; +} + +static __devinit int softing_netdev_register(struct net_device *netdev) +{ + int ret; + + netdev->sysfs_groups[0] = &netdev_sysfs_group; + ret = register_candev(netdev); + if (ret) { + dev_alert(&netdev->dev, "register failed\n"); + return ret; + } + return 0; +} + +static void softing_netdev_cleanup(struct net_device *netdev) +{ + unregister_candev(netdev); + free_candev(netdev); +} + +/* + * sysfs for Platform device + */ +#define DEV_ATTR_RO(name, member) \ +static ssize_t show_##name(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct softing *card = platform_get_drvdata(to_platform_device(dev)); \ + return sprintf(buf, "%u\n", card->member); \ +} \ +static DEVICE_ATTR(name, 0444, show_##name, NULL) + +#define DEV_ATTR_RO_STR(name, member) \ +static ssize_t show_##name(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct softing *card = platform_get_drvdata(to_platform_device(dev)); \ + return sprintf(buf, "%s\n", card->member); \ +} \ +static DEVICE_ATTR(name, 0444, show_##name, NULL) + +DEV_ATTR_RO(serial, id.serial); +DEV_ATTR_RO_STR(firmware, pdat->app.fw); +DEV_ATTR_RO(firmware_version, id.fw_version); +DEV_ATTR_RO_STR(hardware, pdat->name); +DEV_ATTR_RO(hardware_version, id.hw_version); +DEV_ATTR_RO(license, id.license); +DEV_ATTR_RO(frequency, id.freq); +DEV_ATTR_RO(txpending, tx.pending); + +static struct attribute *softing_pdev_attrs[] = { + &dev_attr_serial.attr, + &dev_attr_firmware.attr, + &dev_attr_firmware_version.attr, + &dev_attr_hardware.attr, + &dev_attr_hardware_version.attr, + &dev_attr_license.attr, + &dev_attr_frequency.attr, + &dev_attr_txpending.attr, + NULL, +}; + +static const struct attribute_group softing_pdev_group = { + .name = NULL, + .attrs = softing_pdev_attrs, +}; + +/* + * platform driver + */ +static __devexit int softing_pdev_remove(struct platform_device *pdev) +{ + struct softing *card = platform_get_drvdata(pdev); + int j; + + /* first, disable card*/ + softing_card_shutdown(card); + + for (j = 0; j < ARRAY_SIZE(card->net); ++j) { + if (!card->net[j]) + continue; + softing_netdev_cleanup(card->net[j]); + card->net[j] = NULL; + } + sysfs_remove_group(&pdev->dev.kobj, &softing_pdev_group); + + iounmap(card->dpram); + kfree(card); + return 0; +} + +static __devinit int softing_pdev_probe(struct platform_device *pdev) +{ + const struct softing_platform_data *pdat = pdev->dev.platform_data; + struct softing *card; + struct net_device *netdev; + struct softing_priv *priv; + struct resource *pres; + int ret; + int j; + + if (!pdat) { + dev_warn(&pdev->dev, "no platform data\n"); + return -EINVAL; + } + if (pdat->nbus > ARRAY_SIZE(card->net)) { + dev_warn(&pdev->dev, "%u nets??\n", pdat->nbus); + return -EINVAL; + } + + card = kzalloc(sizeof(*card), GFP_KERNEL); + if (!card) + return -ENOMEM; + card->pdat = pdat; + card->pdev = pdev; + platform_set_drvdata(pdev, card); + mutex_init(&card->fw.lock); + spin_lock_init(&card->spin); + + ret = -EINVAL; + pres = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!pres) + goto platform_resource_failed;; + card->dpram_phys = pres->start; + card->dpram_size = pres->end - pres->start + 1; + card->dpram = ioremap_nocache(card->dpram_phys, card->dpram_size); + if (!card->dpram) { + dev_alert(&card->pdev->dev, "dpram ioremap failed\n"); + goto ioremap_failed; + } + + pres = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + if (pres) + card->irq.nr = pres->start; + + /* reset card */ + ret = softing_card_boot(card); + if (ret < 0) { + dev_alert(&pdev->dev, "failed to boot\n"); + goto boot_failed; + } + + /* only now, the chip's are known */ + card->id.freq = card->pdat->freq; + + ret = sysfs_create_group(&pdev->dev.kobj, &softing_pdev_group); + if (ret < 0) { + dev_alert(&card->pdev->dev, "sysfs failed\n"); + goto sysfs_failed; + } + + ret = -ENOMEM; + for (j = 0; j < ARRAY_SIZE(card->net); ++j) { + card->net[j] = netdev = + softing_netdev_create(card, card->id.chip[j]); + if (!netdev) { + dev_alert(&pdev->dev, "failed to make can[%i]", j); + goto netdev_failed; + } + priv = netdev_priv(card->net[j]); + priv->index = j; + ret = softing_netdev_register(netdev); + if (ret) { + free_candev(netdev); + card->net[j] = NULL; + dev_alert(&card->pdev->dev, + "failed to register can[%i]\n", j); + goto netdev_failed; + } + } + dev_info(&card->pdev->dev, "%s ready.\n", card->pdat->name); + return 0; + +netdev_failed: + for (j = 0; j < ARRAY_SIZE(card->net); ++j) { + if (!card->net[j]) + continue; + softing_netdev_cleanup(card->net[j]); + } + sysfs_remove_group(&pdev->dev.kobj, &softing_pdev_group); +sysfs_failed: + softing_card_shutdown(card); +boot_failed: + iounmap(card->dpram); +ioremap_failed: +platform_resource_failed: + kfree(card); + return ret; +} + +static struct platform_driver softing_driver = { + .driver = { + .name = "softing", + .owner = THIS_MODULE, + }, + .probe = softing_pdev_probe, + .remove = __devexit_p(softing_pdev_remove), +}; + +MODULE_ALIAS("platform:softing"); + +static int __init softing_start(void) +{ + return platform_driver_register(&softing_driver); +} + +static void __exit softing_stop(void) +{ + platform_driver_unregister(&softing_driver); +} + +module_init(softing_start); +module_exit(softing_stop); + +MODULE_DESCRIPTION("Softing DPRAM CAN driver"); +MODULE_AUTHOR("Kurt Van Dijck <kurt.van.dijck@eia.be>"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/net/can/softing/softing_platform.h b/drivers/net/can/softing/softing_platform.h new file mode 100644 index 00000000000..ebbf6981562 --- /dev/null +++ b/drivers/net/can/softing/softing_platform.h @@ -0,0 +1,40 @@ + +#include <linux/platform_device.h> + +#ifndef _SOFTING_DEVICE_H_ +#define _SOFTING_DEVICE_H_ + +/* softing firmware directory prefix */ +#define fw_dir "softing-4.6/" + +struct softing_platform_data { + unsigned int manf; + unsigned int prod; + /* + * generation + * 1st with NEC or SJA1000 + * 8bit, exclusive interrupt, ... + * 2nd only SJA1000 + * 16bit, shared interrupt + */ + int generation; + int nbus; /* # busses on device */ + unsigned int freq; /* operating frequency in Hz */ + unsigned int max_brp; + unsigned int max_sjw; + unsigned long dpram_size; + const char *name; + struct { + unsigned long offs; + unsigned long addr; + const char *fw; + } boot, load, app; + /* + * reset() function + * bring pdev in or out of reset, depending on value + */ + int (*reset)(struct platform_device *pdev, int value); + int (*enable_irq)(struct platform_device *pdev, int value); +}; + +#endif diff --git a/drivers/net/cxgb4/cxgb4_main.c b/drivers/net/cxgb4/cxgb4_main.c index 059c1eec8c3..ec35d458102 100644 --- a/drivers/net/cxgb4/cxgb4_main.c +++ b/drivers/net/cxgb4/cxgb4_main.c @@ -2710,6 +2710,8 @@ static int cxgb_open(struct net_device *dev) struct port_info *pi = netdev_priv(dev); struct adapter *adapter = pi->adapter; + netif_carrier_off(dev); + if (!(adapter->flags & FULL_INIT_DONE)) { err = cxgb_up(adapter); if (err < 0) @@ -3661,7 +3663,6 @@ static int __devinit init_one(struct pci_dev *pdev, pi->xact_addr_filt = -1; pi->rx_offload = RX_CSO; pi->port_id = i; - netif_carrier_off(netdev); netdev->irq = pdev->irq; netdev->features |= NETIF_F_SG | TSO_FLAGS; diff --git a/drivers/net/e1000e/e1000.h b/drivers/net/e1000e/e1000.h index e610e136905..00bf595ebd6 100644 --- a/drivers/net/e1000e/e1000.h +++ b/drivers/net/e1000e/e1000.h @@ -364,6 +364,7 @@ struct e1000_adapter { /* structs defined in e1000_hw.h */ struct e1000_hw hw; + spinlock_t stats64_lock; struct e1000_hw_stats stats; struct e1000_phy_info phy_info; struct e1000_phy_stats phy_stats; @@ -494,7 +495,9 @@ extern int e1000e_setup_rx_resources(struct e1000_adapter *adapter); extern int e1000e_setup_tx_resources(struct e1000_adapter *adapter); extern void e1000e_free_rx_resources(struct e1000_adapter *adapter); extern void e1000e_free_tx_resources(struct e1000_adapter *adapter); -extern void e1000e_update_stats(struct e1000_adapter *adapter); +extern struct rtnl_link_stats64 *e1000e_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 + *stats); extern void e1000e_set_interrupt_capability(struct e1000_adapter *adapter); extern void e1000e_reset_interrupt_capability(struct e1000_adapter *adapter); extern void e1000e_get_hw_control(struct e1000_adapter *adapter); diff --git a/drivers/net/e1000e/ethtool.c b/drivers/net/e1000e/ethtool.c index fa08b6336cf..daa7fe4b9fd 100644 --- a/drivers/net/e1000e/ethtool.c +++ b/drivers/net/e1000e/ethtool.c @@ -46,15 +46,15 @@ struct e1000_stats { }; #define E1000_STAT(str, m) { \ - .stat_string = str, \ - .type = E1000_STATS, \ - .sizeof_stat = sizeof(((struct e1000_adapter *)0)->m), \ - .stat_offset = offsetof(struct e1000_adapter, m) } + .stat_string = str, \ + .type = E1000_STATS, \ + .sizeof_stat = sizeof(((struct e1000_adapter *)0)->m), \ + .stat_offset = offsetof(struct e1000_adapter, m) } #define E1000_NETDEV_STAT(str, m) { \ - .stat_string = str, \ - .type = NETDEV_STATS, \ - .sizeof_stat = sizeof(((struct net_device *)0)->m), \ - .stat_offset = offsetof(struct net_device, m) } + .stat_string = str, \ + .type = NETDEV_STATS, \ + .sizeof_stat = sizeof(((struct rtnl_link_stats64 *)0)->m), \ + .stat_offset = offsetof(struct rtnl_link_stats64, m) } static const struct e1000_stats e1000_gstrings_stats[] = { E1000_STAT("rx_packets", stats.gprc), @@ -65,21 +65,21 @@ static const struct e1000_stats e1000_gstrings_stats[] = { E1000_STAT("tx_broadcast", stats.bptc), E1000_STAT("rx_multicast", stats.mprc), E1000_STAT("tx_multicast", stats.mptc), - E1000_NETDEV_STAT("rx_errors", stats.rx_errors), - E1000_NETDEV_STAT("tx_errors", stats.tx_errors), - E1000_NETDEV_STAT("tx_dropped", stats.tx_dropped), + E1000_NETDEV_STAT("rx_errors", rx_errors), + E1000_NETDEV_STAT("tx_errors", tx_errors), + E1000_NETDEV_STAT("tx_dropped", tx_dropped), E1000_STAT("multicast", stats.mprc), E1000_STAT("collisions", stats.colc), - E1000_NETDEV_STAT("rx_length_errors", stats.rx_length_errors), - E1000_NETDEV_STAT("rx_over_errors", stats.rx_over_errors), + E1000_NETDEV_STAT("rx_length_errors", rx_length_errors), + E1000_NETDEV_STAT("rx_over_errors", rx_over_errors), E1000_STAT("rx_crc_errors", stats.crcerrs), - E1000_NETDEV_STAT("rx_frame_errors", stats.rx_frame_errors), + E1000_NETDEV_STAT("rx_frame_errors", rx_frame_errors), E1000_STAT("rx_no_buffer_count", stats.rnbc), E1000_STAT("rx_missed_errors", stats.mpc), E1000_STAT("tx_aborted_errors", stats.ecol), E1000_STAT("tx_carrier_errors", stats.tncrs), - E1000_NETDEV_STAT("tx_fifo_errors", stats.tx_fifo_errors), - E1000_NETDEV_STAT("tx_heartbeat_errors", stats.tx_heartbeat_errors), + E1000_NETDEV_STAT("tx_fifo_errors", tx_fifo_errors), + E1000_NETDEV_STAT("tx_heartbeat_errors", tx_heartbeat_errors), E1000_STAT("tx_window_errors", stats.latecol), E1000_STAT("tx_abort_late_coll", stats.latecol), E1000_STAT("tx_deferred_ok", stats.dc), @@ -684,20 +684,13 @@ static int e1000_set_ringparam(struct net_device *netdev, rx_old = adapter->rx_ring; err = -ENOMEM; - tx_ring = kzalloc(sizeof(struct e1000_ring), GFP_KERNEL); + tx_ring = kmemdup(tx_old, sizeof(struct e1000_ring), GFP_KERNEL); if (!tx_ring) goto err_alloc_tx; - /* - * use a memcpy to save any previously configured - * items like napi structs from having to be - * reinitialized - */ - memcpy(tx_ring, tx_old, sizeof(struct e1000_ring)); - rx_ring = kzalloc(sizeof(struct e1000_ring), GFP_KERNEL); + rx_ring = kmemdup(rx_old, sizeof(struct e1000_ring), GFP_KERNEL); if (!rx_ring) goto err_alloc_rx; - memcpy(rx_ring, rx_old, sizeof(struct e1000_ring)); adapter->tx_ring = tx_ring; adapter->rx_ring = rx_ring; @@ -1255,7 +1248,6 @@ static int e1000_integrated_phy_loopback(struct e1000_adapter *adapter) { struct e1000_hw *hw = &adapter->hw; u32 ctrl_reg = 0; - u32 stat_reg = 0; u16 phy_reg = 0; s32 ret_val = 0; @@ -1363,8 +1355,7 @@ static int e1000_integrated_phy_loopback(struct e1000_adapter *adapter) * Set the ILOS bit on the fiber Nic if half duplex link is * detected. */ - stat_reg = er32(STATUS); - if ((stat_reg & E1000_STATUS_FD) == 0) + if ((er32(STATUS) & E1000_STATUS_FD) == 0) ctrl_reg |= (E1000_CTRL_ILOS | E1000_CTRL_SLU); } @@ -1982,14 +1973,15 @@ static void e1000_get_ethtool_stats(struct net_device *netdev, u64 *data) { struct e1000_adapter *adapter = netdev_priv(netdev); + struct rtnl_link_stats64 net_stats; int i; char *p = NULL; - e1000e_update_stats(adapter); + e1000e_get_stats64(netdev, &net_stats); for (i = 0; i < E1000_GLOBAL_STATS_LEN; i++) { switch (e1000_gstrings_stats[i].type) { case NETDEV_STATS: - p = (char *) netdev + + p = (char *) &net_stats + e1000_gstrings_stats[i].stat_offset; break; case E1000_STATS: diff --git a/drivers/net/e1000e/ich8lan.c b/drivers/net/e1000e/ich8lan.c index fb46974cfec..232b42b7f7c 100644 --- a/drivers/net/e1000e/ich8lan.c +++ b/drivers/net/e1000e/ich8lan.c @@ -2104,7 +2104,6 @@ static s32 e1000_flash_cycle_init_ich8lan(struct e1000_hw *hw) { union ich8_hws_flash_status hsfsts; s32 ret_val = -E1000_ERR_NVM; - s32 i = 0; hsfsts.regval = er16flash(ICH_FLASH_HSFSTS); @@ -2140,6 +2139,8 @@ static s32 e1000_flash_cycle_init_ich8lan(struct e1000_hw *hw) ew16flash(ICH_FLASH_HSFSTS, hsfsts.regval); ret_val = 0; } else { + s32 i = 0; + /* * Otherwise poll for sometime so the current * cycle has a chance to end before giving up. diff --git a/drivers/net/e1000e/lib.c b/drivers/net/e1000e/lib.c index 68aa1749bf6..96921de5df2 100644 --- a/drivers/net/e1000e/lib.c +++ b/drivers/net/e1000e/lib.c @@ -1978,15 +1978,15 @@ static s32 e1000_ready_nvm_eeprom(struct e1000_hw *hw) { struct e1000_nvm_info *nvm = &hw->nvm; u32 eecd = er32(EECD); - u16 timeout = 0; u8 spi_stat_reg; if (nvm->type == e1000_nvm_eeprom_spi) { + u16 timeout = NVM_MAX_RETRY_SPI; + /* Clear SK and CS */ eecd &= ~(E1000_EECD_CS | E1000_EECD_SK); ew32(EECD, eecd); udelay(1); - timeout = NVM_MAX_RETRY_SPI; /* * Read "Status Register" repeatedly until the LSB is cleared. diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c index 1c18f26b081..5b916b01805 100644 --- a/drivers/net/e1000e/netdev.c +++ b/drivers/net/e1000e/netdev.c @@ -900,8 +900,6 @@ next_desc: adapter->total_rx_bytes += total_rx_bytes; adapter->total_rx_packets += total_rx_packets; - netdev->stats.rx_bytes += total_rx_bytes; - netdev->stats.rx_packets += total_rx_packets; return cleaned; } @@ -1057,8 +1055,6 @@ static bool e1000_clean_tx_irq(struct e1000_adapter *adapter) } adapter->total_tx_bytes += total_tx_bytes; adapter->total_tx_packets += total_tx_packets; - netdev->stats.tx_bytes += total_tx_bytes; - netdev->stats.tx_packets += total_tx_packets; return count < tx_ring->count; } @@ -1245,8 +1241,6 @@ next_desc: adapter->total_rx_bytes += total_rx_bytes; adapter->total_rx_packets += total_rx_packets; - netdev->stats.rx_bytes += total_rx_bytes; - netdev->stats.rx_packets += total_rx_packets; return cleaned; } @@ -1426,8 +1420,6 @@ next_desc: adapter->total_rx_bytes += total_rx_bytes; adapter->total_rx_packets += total_rx_packets; - netdev->stats.rx_bytes += total_rx_bytes; - netdev->stats.rx_packets += total_rx_packets; return cleaned; } @@ -2728,7 +2720,6 @@ static void e1000_setup_rctl(struct e1000_adapter *adapter) { struct e1000_hw *hw = &adapter->hw; u32 rctl, rfctl; - u32 psrctl = 0; u32 pages = 0; /* Workaround Si errata on 82579 - configure jumbo frame flow */ @@ -2827,6 +2818,8 @@ static void e1000_setup_rctl(struct e1000_adapter *adapter) adapter->rx_ps_pages = 0; if (adapter->rx_ps_pages) { + u32 psrctl = 0; + /* Configure extra packet-split registers */ rfctl = er32(RFCTL); rfctl |= E1000_RFCTL_EXTEN; @@ -3028,7 +3021,6 @@ static void e1000_set_multi(struct net_device *netdev) struct netdev_hw_addr *ha; u8 *mta_list; u32 rctl; - int i; /* Check for Promiscuous and All Multicast modes */ @@ -3051,12 +3043,13 @@ static void e1000_set_multi(struct net_device *netdev) ew32(RCTL, rctl); if (!netdev_mc_empty(netdev)) { + int i = 0; + mta_list = kmalloc(netdev_mc_count(netdev) * 6, GFP_ATOMIC); if (!mta_list) return; /* prepare a packed array of only addresses. */ - i = 0; netdev_for_each_mc_addr(ha, netdev) memcpy(mta_list + (i++ * ETH_ALEN), ha->addr, ETH_ALEN); @@ -3338,6 +3331,8 @@ int e1000e_up(struct e1000_adapter *adapter) return 0; } +static void e1000e_update_stats(struct e1000_adapter *adapter); + void e1000e_down(struct e1000_adapter *adapter) { struct net_device *netdev = adapter->netdev; @@ -3372,6 +3367,11 @@ void e1000e_down(struct e1000_adapter *adapter) del_timer_sync(&adapter->phy_info_timer); netif_carrier_off(netdev); + + spin_lock(&adapter->stats64_lock); + e1000e_update_stats(adapter); + spin_unlock(&adapter->stats64_lock); + adapter->link_speed = 0; adapter->link_duplex = 0; @@ -3413,6 +3413,8 @@ static int __devinit e1000_sw_init(struct e1000_adapter *adapter) adapter->max_frame_size = netdev->mtu + ETH_HLEN + ETH_FCS_LEN; adapter->min_frame_size = ETH_ZLEN + ETH_FCS_LEN; + spin_lock_init(&adapter->stats64_lock); + e1000e_set_interrupt_capability(adapter); if (e1000_alloc_queues(adapter)) @@ -3886,7 +3888,7 @@ release: * e1000e_update_stats - Update the board statistics counters * @adapter: board private structure **/ -void e1000e_update_stats(struct e1000_adapter *adapter) +static void e1000e_update_stats(struct e1000_adapter *adapter) { struct net_device *netdev = adapter->netdev; struct e1000_hw *hw = &adapter->hw; @@ -3998,10 +4000,11 @@ static void e1000_phy_read_status(struct e1000_adapter *adapter) { struct e1000_hw *hw = &adapter->hw; struct e1000_phy_regs *phy = &adapter->phy_regs; - int ret_val; if ((er32(STATUS) & E1000_STATUS_LU) && (adapter->hw.phy.media_type == e1000_media_type_copper)) { + int ret_val; + ret_val = e1e_rphy(hw, PHY_CONTROL, &phy->bmcr); ret_val |= e1e_rphy(hw, PHY_STATUS, &phy->bmsr); ret_val |= e1e_rphy(hw, PHY_AUTONEG_ADV, &phy->advertise); @@ -4147,7 +4150,6 @@ static void e1000_watchdog_task(struct work_struct *work) struct e1000_ring *tx_ring = adapter->tx_ring; struct e1000_hw *hw = &adapter->hw; u32 link, tctl; - int tx_pending = 0; link = e1000e_has_link(adapter); if ((netif_carrier_ok(netdev)) && link) { @@ -4285,7 +4287,9 @@ static void e1000_watchdog_task(struct work_struct *work) } link_up: + spin_lock(&adapter->stats64_lock); e1000e_update_stats(adapter); + spin_unlock(&adapter->stats64_lock); mac->tx_packet_delta = adapter->stats.tpt - adapter->tpt_old; adapter->tpt_old = adapter->stats.tpt; @@ -4299,21 +4303,18 @@ link_up: e1000e_update_adaptive(&adapter->hw); - if (!netif_carrier_ok(netdev)) { - tx_pending = (e1000_desc_unused(tx_ring) + 1 < - tx_ring->count); - if (tx_pending) { - /* - * We've lost link, so the controller stops DMA, - * but we've got queued Tx work that's never going - * to get done, so reset controller to flush Tx. - * (Do the reset outside of interrupt context). - */ - adapter->tx_timeout_count++; - schedule_work(&adapter->reset_task); - /* return immediately since reset is imminent */ - return; - } + if (!netif_carrier_ok(netdev) && + (e1000_desc_unused(tx_ring) + 1 < tx_ring->count)) { + /* + * We've lost link, so the controller stops DMA, + * but we've got queued Tx work that's never going + * to get done, so reset controller to flush Tx. + * (Do the reset outside of interrupt context). + */ + adapter->tx_timeout_count++; + schedule_work(&adapter->reset_task); + /* return immediately since reset is imminent */ + return; } /* Simple mode for Interrupt Throttle Rate (ITR) */ @@ -4384,13 +4385,13 @@ static int e1000_tso(struct e1000_adapter *adapter, u32 cmd_length = 0; u16 ipcse = 0, tucse, mss; u8 ipcss, ipcso, tucss, tucso, hdr_len; - int err; if (!skb_is_gso(skb)) return 0; if (skb_header_cloned(skb)) { - err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + int err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (err) return err; } @@ -4897,16 +4898,55 @@ static void e1000_reset_task(struct work_struct *work) } /** - * e1000_get_stats - Get System Network Statistics + * e1000_get_stats64 - Get System Network Statistics * @netdev: network interface device structure + * @stats: rtnl_link_stats64 pointer * * Returns the address of the device statistics structure. - * The statistics are actually updated from the timer callback. **/ -static struct net_device_stats *e1000_get_stats(struct net_device *netdev) +struct rtnl_link_stats64 *e1000e_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) { - /* only return the current stats */ - return &netdev->stats; + struct e1000_adapter *adapter = netdev_priv(netdev); + + memset(stats, 0, sizeof(struct rtnl_link_stats64)); + spin_lock(&adapter->stats64_lock); + e1000e_update_stats(adapter); + /* Fill out the OS statistics structure */ + stats->rx_bytes = adapter->stats.gorc; + stats->rx_packets = adapter->stats.gprc; + stats->tx_bytes = adapter->stats.gotc; + stats->tx_packets = adapter->stats.gptc; + stats->multicast = adapter->stats.mprc; + stats->collisions = adapter->stats.colc; + + /* Rx Errors */ + + /* + * RLEC on some newer hardware can be incorrect so build + * our own version based on RUC and ROC + */ + stats->rx_errors = adapter->stats.rxerrc + + adapter->stats.crcerrs + adapter->stats.algnerrc + + adapter->stats.ruc + adapter->stats.roc + + adapter->stats.cexterr; + stats->rx_length_errors = adapter->stats.ruc + + adapter->stats.roc; + stats->rx_crc_errors = adapter->stats.crcerrs; + stats->rx_frame_errors = adapter->stats.algnerrc; + stats->rx_missed_errors = adapter->stats.mpc; + + /* Tx Errors */ + stats->tx_errors = adapter->stats.ecol + + adapter->stats.latecol; + stats->tx_aborted_errors = adapter->stats.ecol; + stats->tx_window_errors = adapter->stats.latecol; + stats->tx_carrier_errors = adapter->stats.tncrs; + + /* Tx Dropped needs to be maintained elsewhere */ + + spin_unlock(&adapter->stats64_lock); + return stats; } /** @@ -5476,9 +5516,10 @@ static irqreturn_t e1000_intr_msix(int irq, void *data) { struct net_device *netdev = data; struct e1000_adapter *adapter = netdev_priv(netdev); - int vector, msix_irq; if (adapter->msix_entries) { + int vector, msix_irq; + vector = 0; msix_irq = adapter->msix_entries[vector].vector; disable_irq(msix_irq); @@ -5675,7 +5716,7 @@ static const struct net_device_ops e1000e_netdev_ops = { .ndo_open = e1000_open, .ndo_stop = e1000_close, .ndo_start_xmit = e1000_xmit_frame, - .ndo_get_stats = e1000_get_stats, + .ndo_get_stats64 = e1000e_get_stats64, .ndo_set_multicast_list = e1000_set_multi, .ndo_set_mac_address = e1000_set_mac, .ndo_change_mtu = e1000_change_mtu, diff --git a/drivers/net/e1000e/phy.c b/drivers/net/e1000e/phy.c index 6bea051b134..6ae31fcfb62 100644 --- a/drivers/net/e1000e/phy.c +++ b/drivers/net/e1000e/phy.c @@ -2409,9 +2409,7 @@ static u32 e1000_get_phy_addr_for_bm_page(u32 page, u32 reg) s32 e1000e_write_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 data) { s32 ret_val; - u32 page_select = 0; u32 page = offset >> IGP_PAGE_SHIFT; - u32 page_shift = 0; ret_val = hw->phy.ops.acquire(hw); if (ret_val) @@ -2427,6 +2425,8 @@ s32 e1000e_write_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 data) hw->phy.addr = e1000_get_phy_addr_for_bm_page(page, offset); if (offset > MAX_PHY_MULTI_PAGE_REG) { + u32 page_shift, page_select; + /* * Page select is register 31 for phy address 1 and 22 for * phy address 2 and 3. Page select is shifted only for @@ -2468,9 +2468,7 @@ out: s32 e1000e_read_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 *data) { s32 ret_val; - u32 page_select = 0; u32 page = offset >> IGP_PAGE_SHIFT; - u32 page_shift = 0; ret_val = hw->phy.ops.acquire(hw); if (ret_val) @@ -2486,6 +2484,8 @@ s32 e1000e_read_phy_reg_bm(struct e1000_hw *hw, u32 offset, u16 *data) hw->phy.addr = e1000_get_phy_addr_for_bm_page(page, offset); if (offset > MAX_PHY_MULTI_PAGE_REG) { + u32 page_shift, page_select; + /* * Page select is register 31 for phy address 1 and 22 for * phy address 2 and 3. Page select is shifted only for diff --git a/drivers/net/enic/enic.h b/drivers/net/enic/enic.h index a937f49d9db..ca3be4f1555 100644 --- a/drivers/net/enic/enic.h +++ b/drivers/net/enic/enic.h @@ -32,8 +32,8 @@ #define DRV_NAME "enic" #define DRV_DESCRIPTION "Cisco VIC Ethernet NIC Driver" -#define DRV_VERSION "1.4.1.10" -#define DRV_COPYRIGHT "Copyright 2008-2010 Cisco Systems, Inc" +#define DRV_VERSION "2.1.1.2" +#define DRV_COPYRIGHT "Copyright 2008-2011 Cisco Systems, Inc" #define ENIC_BARS_MAX 6 @@ -49,7 +49,7 @@ struct enic_msix_entry { void *devid; }; -#define ENIC_SET_APPLIED (1 << 0) +#define ENIC_PORT_REQUEST_APPLIED (1 << 0) #define ENIC_SET_REQUEST (1 << 1) #define ENIC_SET_NAME (1 << 2) #define ENIC_SET_INSTANCE (1 << 3) diff --git a/drivers/net/enic/enic_main.c b/drivers/net/enic/enic_main.c index a0af48c51fb..89664c67097 100644 --- a/drivers/net/enic/enic_main.c +++ b/drivers/net/enic/enic_main.c @@ -1318,18 +1318,20 @@ static int enic_set_port_profile(struct enic *enic, u8 *mac) vic_provinfo_free(vp); if (err) return err; - - enic->pp.set |= ENIC_SET_APPLIED; break; case PORT_REQUEST_DISASSOCIATE: - enic->pp.set &= ~ENIC_SET_APPLIED; break; default: return -EINVAL; } + /* Set flag to indicate that the port assoc/disassoc + * request has been sent out to fw + */ + enic->pp.set |= ENIC_PORT_REQUEST_APPLIED; + return 0; } @@ -1411,7 +1413,7 @@ static int enic_get_vf_port(struct net_device *netdev, int vf, int err, error, done; u16 response = PORT_PROFILE_RESPONSE_SUCCESS; - if (!(enic->pp.set & ENIC_SET_APPLIED)) + if (!(enic->pp.set & ENIC_PORT_REQUEST_APPLIED)) return -ENODATA; err = enic_dev_init_done(enic, &done, &error); diff --git a/drivers/net/igb/e1000_82575.c b/drivers/net/igb/e1000_82575.c index 0a2368fa6bc..c1552b6f4a6 100644 --- a/drivers/net/igb/e1000_82575.c +++ b/drivers/net/igb/e1000_82575.c @@ -129,6 +129,7 @@ static s32 igb_get_invariants_82575(struct e1000_hw *hw) break; case E1000_DEV_ID_82580_COPPER: case E1000_DEV_ID_82580_FIBER: + case E1000_DEV_ID_82580_QUAD_FIBER: case E1000_DEV_ID_82580_SERDES: case E1000_DEV_ID_82580_SGMII: case E1000_DEV_ID_82580_COPPER_DUAL: diff --git a/drivers/net/igb/e1000_hw.h b/drivers/net/igb/e1000_hw.h index e2638afb8cd..281324e8598 100644 --- a/drivers/net/igb/e1000_hw.h +++ b/drivers/net/igb/e1000_hw.h @@ -54,6 +54,7 @@ struct e1000_hw; #define E1000_DEV_ID_82580_SERDES 0x1510 #define E1000_DEV_ID_82580_SGMII 0x1511 #define E1000_DEV_ID_82580_COPPER_DUAL 0x1516 +#define E1000_DEV_ID_82580_QUAD_FIBER 0x1527 #define E1000_DEV_ID_DH89XXCC_SGMII 0x0438 #define E1000_DEV_ID_DH89XXCC_SERDES 0x043A #define E1000_DEV_ID_DH89XXCC_BACKPLANE 0x043C diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c index 58c665b7513..200cc320967 100644 --- a/drivers/net/igb/igb_main.c +++ b/drivers/net/igb/igb_main.c @@ -68,6 +68,7 @@ static DEFINE_PCI_DEVICE_TABLE(igb_pci_tbl) = { { PCI_VDEVICE(INTEL, E1000_DEV_ID_I350_SGMII), board_82575 }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_COPPER), board_82575 }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_FIBER), board_82575 }, + { PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_QUAD_FIBER), board_82575 }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_SERDES), board_82575 }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_SGMII), board_82575 }, { PCI_VDEVICE(INTEL, E1000_DEV_ID_82580_COPPER_DUAL), board_82575 }, diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index c7a6c446697..9f6d670748d 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -592,8 +592,8 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) ppp_release(NULL, file); err = 0; } else - printk(KERN_DEBUG "PPPIOCDETACH file->f_count=%ld\n", - atomic_long_read(&file->f_count)); + pr_warn("PPPIOCDETACH file->f_count=%ld\n", + atomic_long_read(&file->f_count)); mutex_unlock(&ppp_mutex); return err; } @@ -630,7 +630,7 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (pf->kind != INTERFACE) { /* can't happen */ - printk(KERN_ERR "PPP: not interface or channel??\n"); + pr_err("PPP: not interface or channel??\n"); return -EINVAL; } @@ -704,7 +704,8 @@ static long ppp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } vj = slhc_init(val2+1, val+1); if (!vj) { - printk(KERN_ERR "PPP: no memory (VJ compressor)\n"); + netdev_err(ppp->dev, + "PPP: no memory (VJ compressor)\n"); err = -ENOMEM; break; } @@ -898,17 +899,17 @@ static int __init ppp_init(void) { int err; - printk(KERN_INFO "PPP generic driver version " PPP_VERSION "\n"); + pr_info("PPP generic driver version " PPP_VERSION "\n"); err = register_pernet_device(&ppp_net_ops); if (err) { - printk(KERN_ERR "failed to register PPP pernet device (%d)\n", err); + pr_err("failed to register PPP pernet device (%d)\n", err); goto out; } err = register_chrdev(PPP_MAJOR, "ppp", &ppp_device_fops); if (err) { - printk(KERN_ERR "failed to register PPP device (%d)\n", err); + pr_err("failed to register PPP device (%d)\n", err); goto out_net; } @@ -1078,7 +1079,7 @@ pad_compress_skb(struct ppp *ppp, struct sk_buff *skb) new_skb = alloc_skb(new_skb_size, GFP_ATOMIC); if (!new_skb) { if (net_ratelimit()) - printk(KERN_ERR "PPP: no memory (comp pkt)\n"); + netdev_err(ppp->dev, "PPP: no memory (comp pkt)\n"); return NULL; } if (ppp->dev->hard_header_len > PPP_HDRLEN) @@ -1108,7 +1109,7 @@ pad_compress_skb(struct ppp *ppp, struct sk_buff *skb) * the same number. */ if (net_ratelimit()) - printk(KERN_ERR "ppp: compressor dropped pkt\n"); + netdev_err(ppp->dev, "ppp: compressor dropped pkt\n"); kfree_skb(skb); kfree_skb(new_skb); new_skb = NULL; @@ -1138,7 +1139,9 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) if (ppp->pass_filter && sk_run_filter(skb, ppp->pass_filter) == 0) { if (ppp->debug & 1) - printk(KERN_DEBUG "PPP: outbound frame not passed\n"); + netdev_printk(KERN_DEBUG, ppp->dev, + "PPP: outbound frame " + "not passed\n"); kfree_skb(skb); return; } @@ -1164,7 +1167,7 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) new_skb = alloc_skb(skb->len + ppp->dev->hard_header_len - 2, GFP_ATOMIC); if (!new_skb) { - printk(KERN_ERR "PPP: no memory (VJ comp pkt)\n"); + netdev_err(ppp->dev, "PPP: no memory (VJ comp pkt)\n"); goto drop; } skb_reserve(new_skb, ppp->dev->hard_header_len - 2); @@ -1202,7 +1205,9 @@ ppp_send_frame(struct ppp *ppp, struct sk_buff *skb) proto != PPP_LCP && proto != PPP_CCP) { if (!(ppp->flags & SC_CCP_UP) && (ppp->flags & SC_MUST_COMP)) { if (net_ratelimit()) - printk(KERN_ERR "ppp: compression required but down - pkt dropped.\n"); + netdev_err(ppp->dev, + "ppp: compression required but " + "down - pkt dropped.\n"); goto drop; } skb = pad_compress_skb(ppp, skb); @@ -1505,7 +1510,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) noskb: spin_unlock_bh(&pch->downl); if (ppp->debug & 1) - printk(KERN_ERR "PPP: no memory (fragment)\n"); + netdev_err(ppp->dev, "PPP: no memory (fragment)\n"); ++ppp->dev->stats.tx_errors; ++ppp->nxseq; return 1; /* abandon the frame */ @@ -1686,7 +1691,8 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) /* copy to a new sk_buff with more tailroom */ ns = dev_alloc_skb(skb->len + 128); if (!ns) { - printk(KERN_ERR"PPP: no memory (VJ decomp)\n"); + netdev_err(ppp->dev, "PPP: no memory " + "(VJ decomp)\n"); goto err; } skb_reserve(ns, 2); @@ -1699,7 +1705,8 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) len = slhc_uncompress(ppp->vj, skb->data + 2, skb->len - 2); if (len <= 0) { - printk(KERN_DEBUG "PPP: VJ decompression error\n"); + netdev_printk(KERN_DEBUG, ppp->dev, + "PPP: VJ decompression error\n"); goto err; } len += 2; @@ -1721,7 +1728,7 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) goto err; if (slhc_remember(ppp->vj, skb->data + 2, skb->len - 2) <= 0) { - printk(KERN_ERR "PPP: VJ uncompressed error\n"); + netdev_err(ppp->dev, "PPP: VJ uncompressed error\n"); goto err; } proto = PPP_IP; @@ -1762,8 +1769,9 @@ ppp_receive_nonmp_frame(struct ppp *ppp, struct sk_buff *skb) if (ppp->pass_filter && sk_run_filter(skb, ppp->pass_filter) == 0) { if (ppp->debug & 1) - printk(KERN_DEBUG "PPP: inbound frame " - "not passed\n"); + netdev_printk(KERN_DEBUG, ppp->dev, + "PPP: inbound frame " + "not passed\n"); kfree_skb(skb); return; } @@ -1821,7 +1829,8 @@ ppp_decompress_frame(struct ppp *ppp, struct sk_buff *skb) ns = dev_alloc_skb(obuff_size); if (!ns) { - printk(KERN_ERR "ppp_decompress_frame: no memory\n"); + netdev_err(ppp->dev, "ppp_decompress_frame: " + "no memory\n"); goto err; } /* the decompressor still expects the A/C bytes in the hdr */ @@ -1989,7 +1998,7 @@ ppp_mp_reconstruct(struct ppp *ppp) u32 seq = ppp->nextseq; u32 minseq = ppp->minseq; struct sk_buff_head *list = &ppp->mrq; - struct sk_buff *p, *next; + struct sk_buff *p, *tmp; struct sk_buff *head, *tail; struct sk_buff *skb = NULL; int lost = 0, len = 0; @@ -1998,13 +2007,15 @@ ppp_mp_reconstruct(struct ppp *ppp) return NULL; head = list->next; tail = NULL; - for (p = head; p != (struct sk_buff *) list; p = next) { - next = p->next; + skb_queue_walk_safe(list, p, tmp) { + again: if (seq_before(PPP_MP_CB(p)->sequence, seq)) { /* this can't happen, anyway ignore the skb */ - printk(KERN_ERR "ppp_mp_reconstruct bad seq %u < %u\n", - PPP_MP_CB(p)->sequence, seq); - head = next; + netdev_err(ppp->dev, "ppp_mp_reconstruct bad " + "seq %u < %u\n", + PPP_MP_CB(p)->sequence, seq); + __skb_unlink(p, list); + kfree_skb(p); continue; } if (PPP_MP_CB(p)->sequence != seq) { @@ -2016,8 +2027,7 @@ ppp_mp_reconstruct(struct ppp *ppp) lost = 1; seq = seq_before(minseq, PPP_MP_CB(p)->sequence)? minseq + 1: PPP_MP_CB(p)->sequence; - next = p; - continue; + goto again; } /* @@ -2042,17 +2052,9 @@ ppp_mp_reconstruct(struct ppp *ppp) (PPP_MP_CB(head)->BEbits & B)) { if (len > ppp->mrru + 2) { ++ppp->dev->stats.rx_length_errors; - printk(KERN_DEBUG "PPP: reconstructed packet" - " is too long (%d)\n", len); - } else if (p == head) { - /* fragment is complete packet - reuse skb */ - tail = p; - skb = skb_get(p); - break; - } else if ((skb = dev_alloc_skb(len)) == NULL) { - ++ppp->dev->stats.rx_missed_errors; - printk(KERN_DEBUG "PPP: no memory for " - "reconstructed packet"); + netdev_printk(KERN_DEBUG, ppp->dev, + "PPP: reconstructed packet" + " is too long (%d)\n", len); } else { tail = p; break; @@ -2065,9 +2067,17 @@ ppp_mp_reconstruct(struct ppp *ppp) * and we haven't found a complete valid packet yet, * we can discard up to and including this fragment. */ - if (PPP_MP_CB(p)->BEbits & E) - head = next; + if (PPP_MP_CB(p)->BEbits & E) { + struct sk_buff *tmp2; + skb_queue_reverse_walk_from_safe(list, p, tmp2) { + __skb_unlink(p, list); + kfree_skb(p); + } + head = skb_peek(list); + if (!head) + break; + } ++seq; } @@ -2077,26 +2087,37 @@ ppp_mp_reconstruct(struct ppp *ppp) signal a receive error. */ if (PPP_MP_CB(head)->sequence != ppp->nextseq) { if (ppp->debug & 1) - printk(KERN_DEBUG " missed pkts %u..%u\n", - ppp->nextseq, - PPP_MP_CB(head)->sequence-1); + netdev_printk(KERN_DEBUG, ppp->dev, + " missed pkts %u..%u\n", + ppp->nextseq, + PPP_MP_CB(head)->sequence-1); ++ppp->dev->stats.rx_dropped; ppp_receive_error(ppp); } - if (head != tail) - /* copy to a single skb */ - for (p = head; p != tail->next; p = p->next) - skb_copy_bits(p, 0, skb_put(skb, p->len), p->len); - ppp->nextseq = PPP_MP_CB(tail)->sequence + 1; - head = tail->next; - } + skb = head; + if (head != tail) { + struct sk_buff **fragpp = &skb_shinfo(skb)->frag_list; + p = skb_queue_next(list, head); + __skb_unlink(skb, list); + skb_queue_walk_from_safe(list, p, tmp) { + __skb_unlink(p, list); + *fragpp = p; + p->next = NULL; + fragpp = &p->next; + + skb->len += p->len; + skb->data_len += p->len; + skb->truesize += p->len; + + if (p == tail) + break; + } + } else { + __skb_unlink(skb, list); + } - /* Discard all the skbuffs that we have copied the data out of - or that we can't use. */ - while ((p = list->next) != head) { - __skb_unlink(p, list); - kfree_skb(p); + ppp->nextseq = PPP_MP_CB(tail)->sequence + 1; } return skb; @@ -2617,8 +2638,8 @@ ppp_create_interface(struct net *net, int unit, int *retp) ret = register_netdev(dev); if (ret != 0) { unit_put(&pn->units_idr, unit); - printk(KERN_ERR "PPP: couldn't register device %s (%d)\n", - dev->name, ret); + netdev_err(ppp->dev, "PPP: couldn't register device %s (%d)\n", + dev->name, ret); goto out2; } @@ -2690,9 +2711,9 @@ static void ppp_destroy_interface(struct ppp *ppp) if (!ppp->file.dead || ppp->n_channels) { /* "can't happen" */ - printk(KERN_ERR "ppp: destroying ppp struct %p but dead=%d " - "n_channels=%d !\n", ppp, ppp->file.dead, - ppp->n_channels); + netdev_err(ppp->dev, "ppp: destroying ppp struct %p " + "but dead=%d n_channels=%d !\n", + ppp, ppp->file.dead, ppp->n_channels); return; } @@ -2834,8 +2855,7 @@ static void ppp_destroy_channel(struct channel *pch) if (!pch->file.dead) { /* "can't happen" */ - printk(KERN_ERR "ppp: destroying undead channel %p !\n", - pch); + pr_err("ppp: destroying undead channel %p !\n", pch); return; } skb_queue_purge(&pch->file.xq); @@ -2847,7 +2867,7 @@ static void __exit ppp_cleanup(void) { /* should never happen */ if (atomic_read(&ppp_unit_count) || atomic_read(&channel_count)) - printk(KERN_ERR "PPP: removing module but units remain!\n"); + pr_err("PPP: removing module but units remain!\n"); unregister_chrdev(PPP_MAJOR, "ppp"); device_destroy(ppp_class, MKDEV(PPP_MAJOR, 0)); class_destroy(ppp_class); @@ -2865,7 +2885,7 @@ static int __unit_alloc(struct idr *p, void *ptr, int n) again: if (!idr_pre_get(p, GFP_KERNEL)) { - printk(KERN_ERR "PPP: No free memory for idr\n"); + pr_err("PPP: No free memory for idr\n"); return -ENOMEM; } diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c index 09cac704fdd..0d6fec6b7d9 100644 --- a/drivers/net/via-velocity.c +++ b/drivers/net/via-velocity.c @@ -2923,6 +2923,7 @@ static u16 wol_calc_crc(int size, u8 *pattern, u8 *mask_pattern) static int velocity_set_wol(struct velocity_info *vptr) { struct mac_regs __iomem *regs = vptr->mac_regs; + enum speed_opt spd_dpx = vptr->options.spd_dpx; static u8 buf[256]; int i; @@ -2968,6 +2969,12 @@ static int velocity_set_wol(struct velocity_info *vptr) writew(0x0FFF, ®s->WOLSRClr); + if (spd_dpx == SPD_DPX_1000_FULL) + goto mac_done; + + if (spd_dpx != SPD_DPX_AUTO) + goto advertise_done; + if (vptr->mii_status & VELOCITY_AUTONEG_ENABLE) { if (PHYID_GET_PHY_ID(vptr->phy_id) == PHYID_CICADA_CS8201) MII_REG_BITS_ON(AUXCR_MDPPS, MII_NCONFIG, vptr->mac_regs); @@ -2978,6 +2985,7 @@ static int velocity_set_wol(struct velocity_info *vptr) if (vptr->mii_status & VELOCITY_SPEED_1000) MII_REG_BITS_ON(BMCR_ANRESTART, MII_BMCR, vptr->mac_regs); +advertise_done: BYTE_REG_BITS_ON(CHIPGCR_FCMODE, ®s->CHIPGCR); { @@ -2987,6 +2995,7 @@ static int velocity_set_wol(struct velocity_info *vptr) writeb(GCR, ®s->CHIPGCR); } +mac_done: BYTE_REG_BITS_OFF(ISR_PWEI, ®s->ISR); /* Turn on SWPTAG just before entering power mode */ BYTE_REG_BITS_ON(STICKHW_SWPTAG, ®s->STICKHW); diff --git a/drivers/net/via-velocity.h b/drivers/net/via-velocity.h index aa2e69b9ff6..d7227539484 100644 --- a/drivers/net/via-velocity.h +++ b/drivers/net/via-velocity.h @@ -361,7 +361,7 @@ enum velocity_owner { #define MAC_REG_CHIPGSR 0x9C #define MAC_REG_TESTCFG 0x9D #define MAC_REG_DEBUG 0x9E -#define MAC_REG_CHIPGCR 0x9F +#define MAC_REG_CHIPGCR 0x9F /* Chip Operation and Diagnostic Control */ #define MAC_REG_WOLCR0_SET 0xA0 #define MAC_REG_WOLCR1_SET 0xA1 #define MAC_REG_PWCFG_SET 0xA2 @@ -848,10 +848,10 @@ enum velocity_owner { * Bits in CHIPGCR register */ -#define CHIPGCR_FCGMII 0x80 /* enable GMII mode */ -#define CHIPGCR_FCFDX 0x40 +#define CHIPGCR_FCGMII 0x80 /* force GMII (else MII only) */ +#define CHIPGCR_FCFDX 0x40 /* force full duplex */ #define CHIPGCR_FCRESV 0x20 -#define CHIPGCR_FCMODE 0x10 +#define CHIPGCR_FCMODE 0x10 /* enable MAC forced mode */ #define CHIPGCR_LPSOPT 0x08 #define CHIPGCR_TM1US 0x04 #define CHIPGCR_TM0US 0x02 diff --git a/drivers/net/vxge/vxge-config.c b/drivers/net/vxge/vxge-config.c index 01c05f53e2f..77097e383cf 100644 --- a/drivers/net/vxge/vxge-config.c +++ b/drivers/net/vxge/vxge-config.c @@ -387,8 +387,8 @@ vxge_hw_vpath_eprom_img_ver_get(struct __vxge_hw_device *hldev, data1 = steer_ctrl = 0; status = vxge_hw_vpath_fw_api(vpath, - VXGE_HW_RTS_ACCESS_STEER_CTRL_DATA_STRUCT_SEL_FW_MEMO, VXGE_HW_FW_API_GET_EPROM_REV, + VXGE_HW_RTS_ACCESS_STEER_CTRL_DATA_STRUCT_SEL_FW_MEMO, 0, &data0, &data1, &steer_ctrl); if (status != VXGE_HW_OK) break; @@ -2868,6 +2868,8 @@ __vxge_hw_ring_create(struct __vxge_hw_vpath_handle *vp, ring->rxd_init = attr->rxd_init; ring->rxd_term = attr->rxd_term; ring->buffer_mode = config->buffer_mode; + ring->tim_rti_cfg1_saved = vp->vpath->tim_rti_cfg1_saved; + ring->tim_rti_cfg3_saved = vp->vpath->tim_rti_cfg3_saved; ring->rxds_limit = config->rxds_limit; ring->rxd_size = vxge_hw_ring_rxd_size_get(config->buffer_mode); @@ -3511,6 +3513,8 @@ __vxge_hw_fifo_create(struct __vxge_hw_vpath_handle *vp, /* apply "interrupts per txdl" attribute */ fifo->interrupt_type = VXGE_HW_FIFO_TXD_INT_TYPE_UTILZ; + fifo->tim_tti_cfg1_saved = vpath->tim_tti_cfg1_saved; + fifo->tim_tti_cfg3_saved = vpath->tim_tti_cfg3_saved; if (fifo->config->intr) fifo->interrupt_type = VXGE_HW_FIFO_TXD_INT_TYPE_PER_LIST; @@ -4377,6 +4381,8 @@ __vxge_hw_vpath_tim_configure(struct __vxge_hw_device *hldev, u32 vp_id) } writeq(val64, &vp_reg->tim_cfg1_int_num[VXGE_HW_VPATH_INTR_TX]); + vpath->tim_tti_cfg1_saved = val64; + val64 = readq(&vp_reg->tim_cfg2_int_num[VXGE_HW_VPATH_INTR_TX]); if (config->tti.uec_a != VXGE_HW_USE_FLASH_DEFAULT) { @@ -4433,6 +4439,7 @@ __vxge_hw_vpath_tim_configure(struct __vxge_hw_device *hldev, u32 vp_id) } writeq(val64, &vp_reg->tim_cfg3_int_num[VXGE_HW_VPATH_INTR_TX]); + vpath->tim_tti_cfg3_saved = val64; } if (config->ring.enable == VXGE_HW_RING_ENABLE) { @@ -4481,6 +4488,8 @@ __vxge_hw_vpath_tim_configure(struct __vxge_hw_device *hldev, u32 vp_id) } writeq(val64, &vp_reg->tim_cfg1_int_num[VXGE_HW_VPATH_INTR_RX]); + vpath->tim_rti_cfg1_saved = val64; + val64 = readq(&vp_reg->tim_cfg2_int_num[VXGE_HW_VPATH_INTR_RX]); if (config->rti.uec_a != VXGE_HW_USE_FLASH_DEFAULT) { @@ -4537,6 +4546,7 @@ __vxge_hw_vpath_tim_configure(struct __vxge_hw_device *hldev, u32 vp_id) } writeq(val64, &vp_reg->tim_cfg3_int_num[VXGE_HW_VPATH_INTR_RX]); + vpath->tim_rti_cfg3_saved = val64; } val64 = 0; @@ -4555,26 +4565,6 @@ __vxge_hw_vpath_tim_configure(struct __vxge_hw_device *hldev, u32 vp_id) return status; } -void vxge_hw_vpath_tti_ci_set(struct __vxge_hw_device *hldev, u32 vp_id) -{ - struct __vxge_hw_virtualpath *vpath; - struct vxge_hw_vpath_reg __iomem *vp_reg; - struct vxge_hw_vp_config *config; - u64 val64; - - vpath = &hldev->virtual_paths[vp_id]; - vp_reg = vpath->vp_reg; - config = vpath->vp_config; - - if (config->fifo.enable == VXGE_HW_FIFO_ENABLE && - config->tti.timer_ci_en != VXGE_HW_TIM_TIMER_CI_ENABLE) { - config->tti.timer_ci_en = VXGE_HW_TIM_TIMER_CI_ENABLE; - val64 = readq(&vp_reg->tim_cfg1_int_num[VXGE_HW_VPATH_INTR_TX]); - val64 |= VXGE_HW_TIM_CFG1_INT_NUM_TIMER_CI; - writeq(val64, &vp_reg->tim_cfg1_int_num[VXGE_HW_VPATH_INTR_TX]); - } -} - /* * __vxge_hw_vpath_initialize * This routine is the final phase of init which initializes the diff --git a/drivers/net/vxge/vxge-config.h b/drivers/net/vxge/vxge-config.h index e249e288d16..3c53aa732c9 100644 --- a/drivers/net/vxge/vxge-config.h +++ b/drivers/net/vxge/vxge-config.h @@ -682,6 +682,10 @@ struct __vxge_hw_virtualpath { u32 vsport_number; u32 max_kdfc_db; u32 max_nofl_db; + u64 tim_tti_cfg1_saved; + u64 tim_tti_cfg3_saved; + u64 tim_rti_cfg1_saved; + u64 tim_rti_cfg3_saved; struct __vxge_hw_ring *____cacheline_aligned ringh; struct __vxge_hw_fifo *____cacheline_aligned fifoh; @@ -921,6 +925,9 @@ struct __vxge_hw_ring { u32 doorbell_cnt; u32 total_db_cnt; u64 rxds_limit; + u32 rtimer; + u64 tim_rti_cfg1_saved; + u64 tim_rti_cfg3_saved; enum vxge_hw_status (*callback)( struct __vxge_hw_ring *ringh, @@ -1000,6 +1007,9 @@ struct __vxge_hw_fifo { u32 per_txdl_space; u32 vp_id; u32 tx_intr_num; + u32 rtimer; + u64 tim_tti_cfg1_saved; + u64 tim_tti_cfg3_saved; enum vxge_hw_status (*callback)( struct __vxge_hw_fifo *fifo_handle, diff --git a/drivers/net/vxge/vxge-main.c b/drivers/net/vxge/vxge-main.c index c81a6512c68..e40f619b62b 100644 --- a/drivers/net/vxge/vxge-main.c +++ b/drivers/net/vxge/vxge-main.c @@ -371,9 +371,6 @@ vxge_rx_1b_compl(struct __vxge_hw_ring *ringh, void *dtr, struct vxge_hw_ring_rxd_info ext_info; vxge_debug_entryexit(VXGE_TRACE, "%s: %s:%d", ring->ndev->name, __func__, __LINE__); - ring->pkts_processed = 0; - - vxge_hw_ring_replenish(ringh); do { prefetch((char *)dtr + L1_CACHE_BYTES); @@ -1588,6 +1585,36 @@ static int vxge_reset_vpath(struct vxgedev *vdev, int vp_id) return ret; } +/* Configure CI */ +static void vxge_config_ci_for_tti_rti(struct vxgedev *vdev) +{ + int i = 0; + + /* Enable CI for RTI */ + if (vdev->config.intr_type == MSI_X) { + for (i = 0; i < vdev->no_of_vpath; i++) { + struct __vxge_hw_ring *hw_ring; + + hw_ring = vdev->vpaths[i].ring.handle; + vxge_hw_vpath_dynamic_rti_ci_set(hw_ring); + } + } + + /* Enable CI for TTI */ + for (i = 0; i < vdev->no_of_vpath; i++) { + struct __vxge_hw_fifo *hw_fifo = vdev->vpaths[i].fifo.handle; + vxge_hw_vpath_tti_ci_set(hw_fifo); + /* + * For Inta (with or without napi), Set CI ON for only one + * vpath. (Have only one free running timer). + */ + if ((vdev->config.intr_type == INTA) && (i == 0)) + break; + } + + return; +} + static int do_vxge_reset(struct vxgedev *vdev, int event) { enum vxge_hw_status status; @@ -1753,6 +1780,9 @@ static int do_vxge_reset(struct vxgedev *vdev, int event) netif_tx_wake_all_queues(vdev->ndev); } + /* configure CI */ + vxge_config_ci_for_tti_rti(vdev); + out: vxge_debug_entryexit(VXGE_TRACE, "%s:%d Exiting...", __func__, __LINE__); @@ -1793,22 +1823,29 @@ static void vxge_reset(struct work_struct *work) */ static int vxge_poll_msix(struct napi_struct *napi, int budget) { - struct vxge_ring *ring = - container_of(napi, struct vxge_ring, napi); + struct vxge_ring *ring = container_of(napi, struct vxge_ring, napi); + int pkts_processed; int budget_org = budget; - ring->budget = budget; + ring->budget = budget; + ring->pkts_processed = 0; vxge_hw_vpath_poll_rx(ring->handle); + pkts_processed = ring->pkts_processed; if (ring->pkts_processed < budget_org) { napi_complete(napi); + /* Re enable the Rx interrupts for the vpath */ vxge_hw_channel_msix_unmask( (struct __vxge_hw_channel *)ring->handle, ring->rx_vector_no); + mmiowb(); } - return ring->pkts_processed; + /* We are copying and returning the local variable, in case if after + * clearing the msix interrupt above, if the interrupt fires right + * away which can preempt this NAPI thread */ + return pkts_processed; } static int vxge_poll_inta(struct napi_struct *napi, int budget) @@ -1824,6 +1861,7 @@ static int vxge_poll_inta(struct napi_struct *napi, int budget) for (i = 0; i < vdev->no_of_vpath; i++) { ring = &vdev->vpaths[i].ring; ring->budget = budget; + ring->pkts_processed = 0; vxge_hw_vpath_poll_rx(ring->handle); pkts_processed += ring->pkts_processed; budget -= ring->pkts_processed; @@ -2054,6 +2092,7 @@ static int vxge_open_vpaths(struct vxgedev *vdev) netdev_get_tx_queue(vdev->ndev, 0); vpath->fifo.indicate_max_pkts = vdev->config.fifo_indicate_max_pkts; + vpath->fifo.tx_vector_no = 0; vpath->ring.rx_vector_no = 0; vpath->ring.rx_csum = vdev->rx_csum; vpath->ring.rx_hwts = vdev->rx_hwts; @@ -2079,6 +2118,61 @@ static int vxge_open_vpaths(struct vxgedev *vdev) return VXGE_HW_OK; } +/** + * adaptive_coalesce_tx_interrupts - Changes the interrupt coalescing + * if the interrupts are not within a range + * @fifo: pointer to transmit fifo structure + * Description: The function changes boundary timer and restriction timer + * value depends on the traffic + * Return Value: None + */ +static void adaptive_coalesce_tx_interrupts(struct vxge_fifo *fifo) +{ + fifo->interrupt_count++; + if (jiffies > fifo->jiffies + HZ / 100) { + struct __vxge_hw_fifo *hw_fifo = fifo->handle; + + fifo->jiffies = jiffies; + if (fifo->interrupt_count > VXGE_T1A_MAX_TX_INTERRUPT_COUNT && + hw_fifo->rtimer != VXGE_TTI_RTIMER_ADAPT_VAL) { + hw_fifo->rtimer = VXGE_TTI_RTIMER_ADAPT_VAL; + vxge_hw_vpath_dynamic_tti_rtimer_set(hw_fifo); + } else if (hw_fifo->rtimer != 0) { + hw_fifo->rtimer = 0; + vxge_hw_vpath_dynamic_tti_rtimer_set(hw_fifo); + } + fifo->interrupt_count = 0; + } +} + +/** + * adaptive_coalesce_rx_interrupts - Changes the interrupt coalescing + * if the interrupts are not within a range + * @ring: pointer to receive ring structure + * Description: The function increases of decreases the packet counts within + * the ranges of traffic utilization, if the interrupts due to this ring are + * not within a fixed range. + * Return Value: Nothing + */ +static void adaptive_coalesce_rx_interrupts(struct vxge_ring *ring) +{ + ring->interrupt_count++; + if (jiffies > ring->jiffies + HZ / 100) { + struct __vxge_hw_ring *hw_ring = ring->handle; + + ring->jiffies = jiffies; + if (ring->interrupt_count > VXGE_T1A_MAX_INTERRUPT_COUNT && + hw_ring->rtimer != VXGE_RTI_RTIMER_ADAPT_VAL) { + hw_ring->rtimer = VXGE_RTI_RTIMER_ADAPT_VAL; + vxge_hw_vpath_dynamic_rti_rtimer_set(hw_ring); + } else if (hw_ring->rtimer != 0) { + hw_ring->rtimer = 0; + vxge_hw_vpath_dynamic_rti_rtimer_set(hw_ring); + } + ring->interrupt_count = 0; + } +} + /* * vxge_isr_napi * @irq: the irq of the device. @@ -2139,24 +2233,39 @@ static irqreturn_t vxge_isr_napi(int irq, void *dev_id) #ifdef CONFIG_PCI_MSI -static irqreturn_t -vxge_tx_msix_handle(int irq, void *dev_id) +static irqreturn_t vxge_tx_msix_handle(int irq, void *dev_id) { struct vxge_fifo *fifo = (struct vxge_fifo *)dev_id; + adaptive_coalesce_tx_interrupts(fifo); + + vxge_hw_channel_msix_mask((struct __vxge_hw_channel *)fifo->handle, + fifo->tx_vector_no); + + vxge_hw_channel_msix_clear((struct __vxge_hw_channel *)fifo->handle, + fifo->tx_vector_no); + VXGE_COMPLETE_VPATH_TX(fifo); + vxge_hw_channel_msix_unmask((struct __vxge_hw_channel *)fifo->handle, + fifo->tx_vector_no); + + mmiowb(); + return IRQ_HANDLED; } -static irqreturn_t -vxge_rx_msix_napi_handle(int irq, void *dev_id) +static irqreturn_t vxge_rx_msix_napi_handle(int irq, void *dev_id) { struct vxge_ring *ring = (struct vxge_ring *)dev_id; - /* MSIX_IDX for Rx is 1 */ + adaptive_coalesce_rx_interrupts(ring); + vxge_hw_channel_msix_mask((struct __vxge_hw_channel *)ring->handle, - ring->rx_vector_no); + ring->rx_vector_no); + + vxge_hw_channel_msix_clear((struct __vxge_hw_channel *)ring->handle, + ring->rx_vector_no); napi_schedule(&ring->napi); return IRQ_HANDLED; @@ -2173,14 +2282,20 @@ vxge_alarm_msix_handle(int irq, void *dev_id) VXGE_HW_VPATH_MSIX_ACTIVE) + VXGE_ALARM_MSIX_ID; for (i = 0; i < vdev->no_of_vpath; i++) { + /* Reduce the chance of loosing alarm interrupts by masking + * the vector. A pending bit will be set if an alarm is + * generated and on unmask the interrupt will be fired. + */ vxge_hw_vpath_msix_mask(vdev->vpaths[i].handle, msix_id); + vxge_hw_vpath_msix_clear(vdev->vpaths[i].handle, msix_id); + mmiowb(); status = vxge_hw_vpath_alarm_process(vdev->vpaths[i].handle, vdev->exec_mode); if (status == VXGE_HW_OK) { - vxge_hw_vpath_msix_unmask(vdev->vpaths[i].handle, - msix_id); + msix_id); + mmiowb(); continue; } vxge_debug_intr(VXGE_ERR, @@ -2299,6 +2414,9 @@ static int vxge_enable_msix(struct vxgedev *vdev) vpath->ring.rx_vector_no = (vpath->device_id * VXGE_HW_VPATH_MSIX_ACTIVE) + 1; + vpath->fifo.tx_vector_no = (vpath->device_id * + VXGE_HW_VPATH_MSIX_ACTIVE); + vxge_hw_vpath_msix_set(vpath->handle, tim_msix_id, VXGE_ALARM_MSIX_ID); } @@ -2474,8 +2592,9 @@ INTA_MODE: "%s:vxge:INTA", vdev->ndev->name); vxge_hw_device_set_intr_type(vdev->devh, VXGE_HW_INTR_MODE_IRQLINE); - vxge_hw_vpath_tti_ci_set(vdev->devh, - vdev->vpaths[0].device_id); + + vxge_hw_vpath_tti_ci_set(vdev->vpaths[0].fifo.handle); + ret = request_irq((int) vdev->pdev->irq, vxge_isr_napi, IRQF_SHARED, vdev->desc[0], vdev); @@ -2745,6 +2864,10 @@ static int vxge_open(struct net_device *dev) } netif_tx_start_all_queues(vdev->ndev); + + /* configure CI */ + vxge_config_ci_for_tti_rti(vdev); + goto out0; out2: @@ -3348,7 +3471,7 @@ static int __devinit vxge_device_register(struct __vxge_hw_device *hldev, vxge_debug_init(VXGE_ERR, "%s: vpath memory allocation failed", vdev->ndev->name); - ret = -ENODEV; + ret = -ENOMEM; goto _out1; } @@ -3369,11 +3492,11 @@ static int __devinit vxge_device_register(struct __vxge_hw_device *hldev, if (vdev->config.gro_enable) ndev->features |= NETIF_F_GRO; - if (register_netdev(ndev)) { + ret = register_netdev(ndev); + if (ret) { vxge_debug_init(vxge_hw_device_trace_level_get(hldev), "%s: %s : device registration failed!", ndev->name, __func__); - ret = -ENODEV; goto _out2; } @@ -3444,6 +3567,11 @@ static void vxge_device_unregister(struct __vxge_hw_device *hldev) /* in 2.6 will call stop() if device is up */ unregister_netdev(dev); + kfree(vdev->vpaths); + + /* we are safe to free it now */ + free_netdev(dev); + vxge_debug_init(vdev->level_trace, "%s: ethernet device unregistered", buf); vxge_debug_entryexit(vdev->level_trace, "%s: %s:%d Exiting...", buf, @@ -3799,7 +3927,7 @@ static void __devinit vxge_device_config_init( break; case MSI_X: - device_config->intr_mode = VXGE_HW_INTR_MODE_MSIX; + device_config->intr_mode = VXGE_HW_INTR_MODE_MSIX_ONE_SHOT; break; } @@ -4335,10 +4463,10 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre) goto _exit1; } - if (pci_request_region(pdev, 0, VXGE_DRIVER_NAME)) { + ret = pci_request_region(pdev, 0, VXGE_DRIVER_NAME); + if (ret) { vxge_debug_init(VXGE_ERR, "%s : request regions failed", __func__); - ret = -ENODEV; goto _exit1; } @@ -4446,7 +4574,7 @@ vxge_probe(struct pci_dev *pdev, const struct pci_device_id *pre) if (!img[i].is_valid) break; vxge_debug_init(VXGE_TRACE, "%s: EPROM %d, version " - "%d.%d.%d.%d\n", VXGE_DRIVER_NAME, i, + "%d.%d.%d.%d", VXGE_DRIVER_NAME, i, VXGE_EPROM_IMG_MAJOR(img[i].version), VXGE_EPROM_IMG_MINOR(img[i].version), VXGE_EPROM_IMG_FIX(img[i].version), @@ -4643,8 +4771,9 @@ _exit6: _exit5: vxge_device_unregister(hldev); _exit4: - pci_disable_sriov(pdev); + pci_set_drvdata(pdev, NULL); vxge_hw_device_terminate(hldev); + pci_disable_sriov(pdev); _exit3: iounmap(attr.bar0); _exit2: @@ -4655,7 +4784,7 @@ _exit0: kfree(ll_config); kfree(device_config); driver_config->config_dev_cnt--; - pci_set_drvdata(pdev, NULL); + driver_config->total_dev_cnt--; return ret; } @@ -4668,45 +4797,34 @@ _exit0: static void __devexit vxge_remove(struct pci_dev *pdev) { struct __vxge_hw_device *hldev; - struct vxgedev *vdev = NULL; - struct net_device *dev; - int i = 0; + struct vxgedev *vdev; + int i; hldev = pci_get_drvdata(pdev); - if (hldev == NULL) return; - dev = hldev->ndev; - vdev = netdev_priv(dev); + vdev = netdev_priv(hldev->ndev); vxge_debug_entryexit(vdev->level_trace, "%s:%d", __func__, __LINE__); - vxge_debug_init(vdev->level_trace, "%s : removing PCI device...", __func__); - vxge_device_unregister(hldev); - for (i = 0; i < vdev->no_of_vpath; i++) { + for (i = 0; i < vdev->no_of_vpath; i++) vxge_free_mac_add_list(&vdev->vpaths[i]); - vdev->vpaths[i].mcast_addr_cnt = 0; - vdev->vpaths[i].mac_addr_cnt = 0; - } - - kfree(vdev->vpaths); + vxge_device_unregister(hldev); + pci_set_drvdata(pdev, NULL); + /* Do not call pci_disable_sriov here, as it will break child devices */ + vxge_hw_device_terminate(hldev); iounmap(vdev->bar0); - - /* we are safe to free it now */ - free_netdev(dev); + pci_release_region(pdev, 0); + pci_disable_device(pdev); + driver_config->config_dev_cnt--; + driver_config->total_dev_cnt--; vxge_debug_init(vdev->level_trace, "%s:%d Device unregistered", __func__, __LINE__); - - vxge_hw_device_terminate(hldev); - - pci_disable_device(pdev); - pci_release_region(pdev, 0); - pci_set_drvdata(pdev, NULL); vxge_debug_entryexit(vdev->level_trace, "%s:%d Exiting...", __func__, __LINE__); } diff --git a/drivers/net/vxge/vxge-main.h b/drivers/net/vxge/vxge-main.h index 5746fedc356..40474f0da57 100644 --- a/drivers/net/vxge/vxge-main.h +++ b/drivers/net/vxge/vxge-main.h @@ -59,11 +59,13 @@ #define VXGE_TTI_LTIMER_VAL 1000 #define VXGE_T1A_TTI_LTIMER_VAL 80 #define VXGE_TTI_RTIMER_VAL 0 +#define VXGE_TTI_RTIMER_ADAPT_VAL 10 #define VXGE_T1A_TTI_RTIMER_VAL 400 #define VXGE_RTI_BTIMER_VAL 250 #define VXGE_RTI_LTIMER_VAL 100 #define VXGE_RTI_RTIMER_VAL 0 -#define VXGE_FIFO_INDICATE_MAX_PKTS VXGE_DEF_FIFO_LENGTH +#define VXGE_RTI_RTIMER_ADAPT_VAL 15 +#define VXGE_FIFO_INDICATE_MAX_PKTS VXGE_DEF_FIFO_LENGTH #define VXGE_ISR_POLLING_CNT 8 #define VXGE_MAX_CONFIG_DEV 0xFF #define VXGE_EXEC_MODE_DISABLE 0 @@ -107,6 +109,14 @@ #define RTI_T1A_RX_UFC_C 50 #define RTI_T1A_RX_UFC_D 60 +/* + * The interrupt rate is maintained at 3k per second with the moderation + * parameters for most traffic but not all. This is the maximum interrupt + * count allowed per function with INTA or per vector in the case of + * MSI-X in a 10 millisecond time period. Enabled only for Titan 1A. + */ +#define VXGE_T1A_MAX_INTERRUPT_COUNT 100 +#define VXGE_T1A_MAX_TX_INTERRUPT_COUNT 200 /* Milli secs timer period */ #define VXGE_TIMER_DELAY 10000 @@ -247,6 +257,11 @@ struct vxge_fifo { int tx_steering_type; int indicate_max_pkts; + /* Adaptive interrupt moderation parameters used in T1A */ + unsigned long interrupt_count; + unsigned long jiffies; + + u32 tx_vector_no; /* Tx stats */ struct vxge_fifo_stats stats; } ____cacheline_aligned; @@ -271,6 +286,10 @@ struct vxge_ring { */ int driver_id; + /* Adaptive interrupt moderation parameters used in T1A */ + unsigned long interrupt_count; + unsigned long jiffies; + /* copy of the flag indicating whether rx_csum is to be used */ u32 rx_csum:1, rx_hwts:1; @@ -286,7 +305,7 @@ struct vxge_ring { int vlan_tag_strip; struct vlan_group *vlgrp; - int rx_vector_no; + u32 rx_vector_no; enum vxge_hw_status last_status; /* Rx stats */ diff --git a/drivers/net/vxge/vxge-traffic.c b/drivers/net/vxge/vxge-traffic.c index 4c10d6c4075..8674f331311 100644 --- a/drivers/net/vxge/vxge-traffic.c +++ b/drivers/net/vxge/vxge-traffic.c @@ -218,6 +218,68 @@ exit: return status; } +void vxge_hw_vpath_tti_ci_set(struct __vxge_hw_fifo *fifo) +{ + struct vxge_hw_vpath_reg __iomem *vp_reg; + struct vxge_hw_vp_config *config; + u64 val64; + + if (fifo->config->enable != VXGE_HW_FIFO_ENABLE) + return; + + vp_reg = fifo->vp_reg; + config = container_of(fifo->config, struct vxge_hw_vp_config, fifo); + + if (config->tti.timer_ci_en != VXGE_HW_TIM_TIMER_CI_ENABLE) { + config->tti.timer_ci_en = VXGE_HW_TIM_TIMER_CI_ENABLE; + val64 = readq(&vp_reg->tim_cfg1_int_num[VXGE_HW_VPATH_INTR_TX]); + val64 |= VXGE_HW_TIM_CFG1_INT_NUM_TIMER_CI; + fifo->tim_tti_cfg1_saved = val64; + writeq(val64, &vp_reg->tim_cfg1_int_num[VXGE_HW_VPATH_INTR_TX]); + } +} + +void vxge_hw_vpath_dynamic_rti_ci_set(struct __vxge_hw_ring *ring) +{ + u64 val64 = ring->tim_rti_cfg1_saved; + + val64 |= VXGE_HW_TIM_CFG1_INT_NUM_TIMER_CI; + ring->tim_rti_cfg1_saved = val64; + writeq(val64, &ring->vp_reg->tim_cfg1_int_num[VXGE_HW_VPATH_INTR_RX]); +} + +void vxge_hw_vpath_dynamic_tti_rtimer_set(struct __vxge_hw_fifo *fifo) +{ + u64 val64 = fifo->tim_tti_cfg3_saved; + u64 timer = (fifo->rtimer * 1000) / 272; + + val64 &= ~VXGE_HW_TIM_CFG3_INT_NUM_RTIMER_VAL(0x3ffffff); + if (timer) + val64 |= VXGE_HW_TIM_CFG3_INT_NUM_RTIMER_VAL(timer) | + VXGE_HW_TIM_CFG3_INT_NUM_RTIMER_EVENT_SF(5); + + writeq(val64, &fifo->vp_reg->tim_cfg3_int_num[VXGE_HW_VPATH_INTR_TX]); + /* tti_cfg3_saved is not updated again because it is + * initialized at one place only - init time. + */ +} + +void vxge_hw_vpath_dynamic_rti_rtimer_set(struct __vxge_hw_ring *ring) +{ + u64 val64 = ring->tim_rti_cfg3_saved; + u64 timer = (ring->rtimer * 1000) / 272; + + val64 &= ~VXGE_HW_TIM_CFG3_INT_NUM_RTIMER_VAL(0x3ffffff); + if (timer) + val64 |= VXGE_HW_TIM_CFG3_INT_NUM_RTIMER_VAL(timer) | + VXGE_HW_TIM_CFG3_INT_NUM_RTIMER_EVENT_SF(4); + + writeq(val64, &ring->vp_reg->tim_cfg3_int_num[VXGE_HW_VPATH_INTR_RX]); + /* rti_cfg3_saved is not updated again because it is + * initialized at one place only - init time. + */ +} + /** * vxge_hw_channel_msix_mask - Mask MSIX Vector. * @channeh: Channel for rx or tx handle @@ -254,6 +316,23 @@ vxge_hw_channel_msix_unmask(struct __vxge_hw_channel *channel, int msix_id) } /** + * vxge_hw_channel_msix_clear - Unmask the MSIX Vector. + * @channel: Channel for rx or tx handle + * @msix_id: MSI ID + * + * The function unmasks the msix interrupt for the given msix_id + * if configured in MSIX oneshot mode + * + * Returns: 0 + */ +void vxge_hw_channel_msix_clear(struct __vxge_hw_channel *channel, int msix_id) +{ + __vxge_hw_pio_mem_write32_upper( + (u32) vxge_bVALn(vxge_mBIT(msix_id >> 2), 0, 32), + &channel->common_reg->clr_msix_one_shot_vec[msix_id % 4]); +} + +/** * vxge_hw_device_set_intr_type - Updates the configuration * with new interrupt type. * @hldev: HW device handle. @@ -2191,19 +2270,14 @@ vxge_hw_vpath_msix_set(struct __vxge_hw_vpath_handle *vp, int *tim_msix_id, if (vpath->hldev->config.intr_mode == VXGE_HW_INTR_MODE_MSIX_ONE_SHOT) { __vxge_hw_pio_mem_write32_upper((u32)vxge_bVALn( + VXGE_HW_ONE_SHOT_VECT0_EN_ONE_SHOT_VECT0_EN, + 0, 32), &vp_reg->one_shot_vect0_en); + __vxge_hw_pio_mem_write32_upper((u32)vxge_bVALn( VXGE_HW_ONE_SHOT_VECT1_EN_ONE_SHOT_VECT1_EN, 0, 32), &vp_reg->one_shot_vect1_en); - } - - if (vpath->hldev->config.intr_mode == - VXGE_HW_INTR_MODE_MSIX_ONE_SHOT) { __vxge_hw_pio_mem_write32_upper((u32)vxge_bVALn( VXGE_HW_ONE_SHOT_VECT2_EN_ONE_SHOT_VECT2_EN, 0, 32), &vp_reg->one_shot_vect2_en); - - __vxge_hw_pio_mem_write32_upper((u32)vxge_bVALn( - VXGE_HW_ONE_SHOT_VECT3_EN_ONE_SHOT_VECT3_EN, - 0, 32), &vp_reg->one_shot_vect3_en); } } @@ -2229,6 +2303,32 @@ vxge_hw_vpath_msix_mask(struct __vxge_hw_vpath_handle *vp, int msix_id) } /** + * vxge_hw_vpath_msix_clear - Clear MSIX Vector. + * @vp: Virtual Path handle. + * @msix_id: MSI ID + * + * The function clears the msix interrupt for the given msix_id + * + * Returns: 0, + * Otherwise, VXGE_HW_ERR_WRONG_IRQ if the msix index is out of range + * status. + * See also: + */ +void vxge_hw_vpath_msix_clear(struct __vxge_hw_vpath_handle *vp, int msix_id) +{ + struct __vxge_hw_device *hldev = vp->vpath->hldev; + + if ((hldev->config.intr_mode == VXGE_HW_INTR_MODE_MSIX_ONE_SHOT)) + __vxge_hw_pio_mem_write32_upper( + (u32) vxge_bVALn(vxge_mBIT((msix_id >> 2)), 0, 32), + &hldev->common_reg->clr_msix_one_shot_vec[msix_id % 4]); + else + __vxge_hw_pio_mem_write32_upper( + (u32) vxge_bVALn(vxge_mBIT((msix_id >> 2)), 0, 32), + &hldev->common_reg->clear_msix_mask_vect[msix_id % 4]); +} + +/** * vxge_hw_vpath_msix_unmask - Unmask the MSIX Vector. * @vp: Virtual Path handle. * @msix_id: MSI ID diff --git a/drivers/net/vxge/vxge-traffic.h b/drivers/net/vxge/vxge-traffic.h index d48486d6afa..9d9dfda4c7a 100644 --- a/drivers/net/vxge/vxge-traffic.h +++ b/drivers/net/vxge/vxge-traffic.h @@ -2142,6 +2142,10 @@ void vxge_hw_device_clear_tx_rx( * Virtual Paths */ +void vxge_hw_vpath_dynamic_rti_rtimer_set(struct __vxge_hw_ring *ring); + +void vxge_hw_vpath_dynamic_tti_rtimer_set(struct __vxge_hw_fifo *fifo); + u32 vxge_hw_vpath_id( struct __vxge_hw_vpath_handle *vpath_handle); @@ -2245,6 +2249,8 @@ void vxge_hw_vpath_msix_mask(struct __vxge_hw_vpath_handle *vpath_handle, int msix_id); +void vxge_hw_vpath_msix_clear(struct __vxge_hw_vpath_handle *vp, int msix_id); + void vxge_hw_device_flush_io(struct __vxge_hw_device *devh); void @@ -2270,6 +2276,9 @@ void vxge_hw_channel_msix_unmask(struct __vxge_hw_channel *channelh, int msix_id); void +vxge_hw_channel_msix_clear(struct __vxge_hw_channel *channelh, int msix_id); + +void vxge_hw_channel_dtr_try_complete(struct __vxge_hw_channel *channel, void **dtrh); @@ -2282,7 +2291,8 @@ vxge_hw_channel_dtr_free(struct __vxge_hw_channel *channel, void *dtrh); int vxge_hw_channel_dtr_count(struct __vxge_hw_channel *channel); -void -vxge_hw_vpath_tti_ci_set(struct __vxge_hw_device *hldev, u32 vp_id); +void vxge_hw_vpath_tti_ci_set(struct __vxge_hw_fifo *fifo); + +void vxge_hw_vpath_dynamic_rti_ci_set(struct __vxge_hw_ring *ring); #endif diff --git a/drivers/net/vxge/vxge-version.h b/drivers/net/vxge/vxge-version.h index ad2f99b9bcf..581e21525e8 100644 --- a/drivers/net/vxge/vxge-version.h +++ b/drivers/net/vxge/vxge-version.h @@ -16,8 +16,8 @@ #define VXGE_VERSION_MAJOR "2" #define VXGE_VERSION_MINOR "5" -#define VXGE_VERSION_FIX "1" -#define VXGE_VERSION_BUILD "22082" +#define VXGE_VERSION_FIX "2" +#define VXGE_VERSION_BUILD "22259" #define VXGE_VERSION_FOR "k" #define VXGE_FW_VER(maj, min, bld) (((maj) << 16) + ((min) << 8) + (bld)) diff --git a/drivers/rapidio/rio-scan.c b/drivers/rapidio/rio-scan.c index 467e82bd092..a50391b6ba2 100644 --- a/drivers/rapidio/rio-scan.c +++ b/drivers/rapidio/rio-scan.c @@ -943,6 +943,8 @@ static int rio_enum_complete(struct rio_mport *port) * @port: Master port to send transactions * @destid: Current destination ID in network * @hopcount: Number of hops into the network + * @prev: previous rio_dev + * @prev_port: previous port number * * Recursively discovers a RIO network. Transactions are sent via the * master port passed in @port. diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 4941cade319..cdd97192dc6 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -97,18 +97,6 @@ config RTC_INTF_DEV If unsure, say Y. -config RTC_INTF_DEV_UIE_EMUL - bool "RTC UIE emulation on dev interface" - depends on RTC_INTF_DEV - help - Provides an emulation for RTC_UIE if the underlying rtc chip - driver does not expose RTC_UIE ioctls. Those requests generate - once-per-second update interrupts, used for synchronization. - - The emulation code will read the time from the hardware - clock several times per second, please enable this option - only if you know that you really need it. - config RTC_DRV_TEST tristate "Test driver/device" help diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 90384b9f6b2..925006d3310 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -16,6 +16,9 @@ #include <linux/log2.h> #include <linux/workqueue.h> +static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer); +static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer); + static int __rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) { int err; @@ -120,12 +123,18 @@ int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; - alarm->enabled = rtc->aie_timer.enabled; - if (alarm->enabled) + if (rtc->ops == NULL) + err = -ENODEV; + else if (!rtc->ops->read_alarm) + err = -EINVAL; + else { + memset(alarm, 0, sizeof(struct rtc_wkalrm)); + alarm->enabled = rtc->aie_timer.enabled; alarm->time = rtc_ktime_to_tm(rtc->aie_timer.node.expires); + } mutex_unlock(&rtc->ops_lock); - return 0; + return err; } EXPORT_SYMBOL_GPL(rtc_read_alarm); @@ -175,16 +184,14 @@ int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) return err; if (rtc->aie_timer.enabled) { rtc_timer_remove(rtc, &rtc->aie_timer); - rtc->aie_timer.enabled = 0; } rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); rtc->aie_timer.period = ktime_set(0, 0); if (alarm->enabled) { - rtc->aie_timer.enabled = 1; - rtc_timer_enqueue(rtc, &rtc->aie_timer); + err = rtc_timer_enqueue(rtc, &rtc->aie_timer); } mutex_unlock(&rtc->ops_lock); - return 0; + return err; } EXPORT_SYMBOL_GPL(rtc_set_alarm); @@ -195,15 +202,15 @@ int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled) return err; if (rtc->aie_timer.enabled != enabled) { - if (enabled) { - rtc->aie_timer.enabled = 1; - rtc_timer_enqueue(rtc, &rtc->aie_timer); - } else { + if (enabled) + err = rtc_timer_enqueue(rtc, &rtc->aie_timer); + else rtc_timer_remove(rtc, &rtc->aie_timer); - rtc->aie_timer.enabled = 0; - } } + if (err) + return err; + if (!rtc->ops) err = -ENODEV; else if (!rtc->ops->alarm_irq_enable) @@ -235,12 +242,9 @@ int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) now = rtc_tm_to_ktime(tm); rtc->uie_rtctimer.node.expires = ktime_add(now, onesec); rtc->uie_rtctimer.period = ktime_set(1, 0); - rtc->uie_rtctimer.enabled = 1; - rtc_timer_enqueue(rtc, &rtc->uie_rtctimer); - } else { + err = rtc_timer_enqueue(rtc, &rtc->uie_rtctimer); + } else rtc_timer_remove(rtc, &rtc->uie_rtctimer); - rtc->uie_rtctimer.enabled = 0; - } out: mutex_unlock(&rtc->ops_lock); @@ -488,10 +492,13 @@ EXPORT_SYMBOL_GPL(rtc_irq_set_freq); * Enqueues a timer onto the rtc devices timerqueue and sets * the next alarm event appropriately. * + * Sets the enabled bit on the added timer. + * * Must hold ops_lock for proper serialization of timerqueue */ -void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) +static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) { + timer->enabled = 1; timerqueue_add(&rtc->timerqueue, &timer->node); if (&timer->node == timerqueue_getnext(&rtc->timerqueue)) { struct rtc_wkalrm alarm; @@ -501,7 +508,13 @@ void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) err = __rtc_set_alarm(rtc, &alarm); if (err == -ETIME) schedule_work(&rtc->irqwork); + else if (err) { + timerqueue_del(&rtc->timerqueue, &timer->node); + timer->enabled = 0; + return err; + } } + return 0; } /** @@ -512,13 +525,15 @@ void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) * Removes a timer onto the rtc devices timerqueue and sets * the next alarm event appropriately. * + * Clears the enabled bit on the removed timer. + * * Must hold ops_lock for proper serialization of timerqueue */ -void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer) +static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer) { struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue); timerqueue_del(&rtc->timerqueue, &timer->node); - + timer->enabled = 0; if (next == &timer->node) { struct rtc_wkalrm alarm; int err; @@ -626,8 +641,7 @@ int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, timer->node.expires = expires; timer->period = period; - timer->enabled = 1; - rtc_timer_enqueue(rtc, timer); + ret = rtc_timer_enqueue(rtc, timer); mutex_unlock(&rtc->ops_lock); return ret; @@ -645,7 +659,6 @@ int rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer* timer) mutex_lock(&rtc->ops_lock); if (timer->enabled) rtc_timer_remove(rtc, timer); - timer->enabled = 0; mutex_unlock(&rtc->ops_lock); return ret; } diff --git a/fs/dcache.c b/fs/dcache.c index 9f493ee4dcb..2a6bd9a4ae9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -176,6 +176,7 @@ static void d_free(struct dentry *dentry) /** * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * @dentry: the target dentry * After this call, in-progress rcu-walk path lookup will fail. This * should be called after unhashing, and after changing d_inode (if * the dentry has not already been unhashed). @@ -281,6 +282,7 @@ static void dentry_lru_move_tail(struct dentry *dentry) /** * d_kill - kill dentry and return parent * @dentry: dentry to kill + * @parent: parent dentry * * The dentry must already be unhashed and removed from the LRU. * @@ -1973,7 +1975,7 @@ out: /** * d_validate - verify dentry provided from insecure source (deprecated) * @dentry: The dentry alleged to be valid child of @dparent - * @parent: The parent dentry (known to be valid) + * @dparent: The parent dentry (known to be valid) * * An insecure source has sent us a dentry, here we verify it and dget() it. * This is used by ncpfs in its readdir implementation. diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 68649336c4a..6ebb81030d2 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -364,6 +364,13 @@ VMLINUX_SYMBOL(__start___param) = .; \ *(__param) \ VMLINUX_SYMBOL(__stop___param) = .; \ + } \ + \ + /* Built-in module versions. */ \ + __modver : AT(ADDR(__modver) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__start___modver) = .; \ + *(__modver) \ + VMLINUX_SYMBOL(__stop___modver) = .; \ . = ALIGN((align)); \ VMLINUX_SYMBOL(__end_rodata) = .; \ } \ diff --git a/include/linux/audit.h b/include/linux/audit.h index 359df048769..9d339eb2788 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -103,6 +103,8 @@ #define AUDIT_BPRM_FCAPS 1321 /* Information about fcaps increasing perms */ #define AUDIT_CAPSET 1322 /* Record showing argument to sys_capset */ #define AUDIT_MMAP 1323 /* Record showing descriptor and flags in mmap */ +#define AUDIT_NETFILTER_PKT 1324 /* Packets traversing netfilter chains */ +#define AUDIT_NETFILTER_CFG 1325 /* Netfilter chain modifications */ #define AUDIT_AVC 1400 /* SE Linux avc denial or grant */ #define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */ diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 010e2d87ed7..d638e85dc50 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -279,8 +279,6 @@ enum dccp_state { DCCP_MAX_STATES }; -#define DCCP_STATE_MASK 0x1f - enum { DCCPF_OPEN = TCPF_ESTABLISHED, DCCPF_REQUESTING = TCPF_SYN_SENT, diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a3b148a9187..0b84c61607e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -249,7 +249,7 @@ static inline enum zone_type gfp_zone(gfp_t flags) ((1 << ZONES_SHIFT) - 1); if (__builtin_constant_p(bit)) - MAYBE_BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); + BUILD_BUG_ON((GFP_ZONE_BAD >> bit) & 1); else { #ifdef CONFIG_DEBUG_VM BUG_ON((GFP_ZONE_BAD >> bit) & 1); diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 6485d2a89be..f4a2e6b1b86 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -135,6 +135,7 @@ enum { IFLA_VF_PORTS, IFLA_PORT_SELF, IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ __IFLA_MAX }; diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h index 5f43a3b2e3a..4deb3834d62 100644 --- a/include/linux/ip_vs.h +++ b/include/linux/ip_vs.h @@ -89,6 +89,14 @@ #define IP_VS_CONN_F_TEMPLATE 0x1000 /* template, not connection */ #define IP_VS_CONN_F_ONE_PACKET 0x2000 /* forward only one packet */ +#define IP_VS_CONN_F_BACKUP_MASK (IP_VS_CONN_F_FWD_MASK | \ + IP_VS_CONN_F_NOOUTPUT | \ + IP_VS_CONN_F_INACTIVE | \ + IP_VS_CONN_F_SEQ_MASK | \ + IP_VS_CONN_F_NO_CPORT | \ + IP_VS_CONN_F_TEMPLATE \ + ) + /* Flags that are not sent to backup server start from bit 16 */ #define IP_VS_CONN_F_NFCT (1 << 16) /* use netfilter conntrack */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d07d8057e44..e2f4d6af212 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -575,12 +575,6 @@ struct sysinfo { char _f[20-2*sizeof(long)-sizeof(int)]; /* Padding: libc5 uses this.. */ }; -/* Force a compilation error if condition is true */ -#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) - -/* Force a compilation error if condition is constant and true */ -#define MAYBE_BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)])) - /* Force a compilation error if a constant expression is not a power of 2 */ #define BUILD_BUG_ON_NOT_POWER_OF_2(n) \ BUILD_BUG_ON((n) == 0 || (((n) & ((n) - 1)) != 0)) @@ -592,6 +586,32 @@ struct sysinfo { #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) #define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); })) +/** + * BUILD_BUG_ON - break compile if a condition is true. + * @cond: the condition which the compiler should know is false. + * + * If you have some code which relies on certain constants being equal, or + * other compile-time-evaluated condition, you should use BUILD_BUG_ON to + * detect if someone changes it. + * + * The implementation uses gcc's reluctance to create a negative array, but + * gcc (as of 4.4) only emits that error for obvious cases (eg. not arguments + * to inline functions). So as a fallback we use the optimizer; if it can't + * prove the condition is false, it will cause a link error on the undefined + * "__build_bug_on_failed". This error message can be harder to track down + * though, hence the two different methods. + */ +#ifndef __OPTIMIZE__ +#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#else +extern int __build_bug_on_failed; +#define BUILD_BUG_ON(condition) \ + do { \ + ((void)sizeof(char[1 - 2*!!(condition)])); \ + if (condition) __build_bug_on_failed = 1; \ + } while(0) +#endif + /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 08d7dc4ddf4..39f8453239f 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -76,7 +76,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size); \ _n = (long) &((ptr)->name##_end) \ - (long) &((ptr)->name##_begin); \ - MAYBE_BUILD_BUG_ON(_n < 0); \ + BUILD_BUG_ON(_n < 0); \ \ kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ } while (0) diff --git a/include/linux/module.h b/include/linux/module.h index 8b17fd8c790..e7c6385c668 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -58,6 +58,12 @@ struct module_attribute { void (*free)(struct module *); }; +struct module_version_attribute { + struct module_attribute mattr; + const char *module_name; + const char *version; +}; + struct module_kobject { struct kobject kobj; @@ -161,7 +167,28 @@ extern struct module __this_module; Using this automatically adds a checksum of the .c files and the local headers in "srcversion". */ + +#if defined(MODULE) || !defined(CONFIG_SYSFS) #define MODULE_VERSION(_version) MODULE_INFO(version, _version) +#else +#define MODULE_VERSION(_version) \ + extern ssize_t __modver_version_show(struct module_attribute *, \ + struct module *, char *); \ + static struct module_version_attribute __modver_version_attr \ + __used \ + __attribute__ ((__section__ ("__modver"),aligned(sizeof(void *)))) \ + = { \ + .mattr = { \ + .attr = { \ + .name = "version", \ + .mode = S_IRUGO, \ + }, \ + .show = __modver_version_show, \ + }, \ + .module_name = KBUILD_MODNAME, \ + .version = _version, \ + } +#endif /* Optional firmware file (or files) needed by the module * format is simply firmware file name. Multiple firmware diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 112adf8bd47..07b41951e3f 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -16,15 +16,17 @@ /* Chosen so that structs with an unsigned long line up. */ #define MAX_PARAM_PREFIX_LEN (64 - sizeof(unsigned long)) -#ifdef MODULE #define ___module_cat(a,b) __mod_ ## a ## b #define __module_cat(a,b) ___module_cat(a,b) +#ifdef MODULE #define __MODULE_INFO(tag, name, info) \ static const char __module_cat(name,__LINE__)[] \ __used __attribute__((section(".modinfo"), unused, aligned(1))) \ = __stringify(tag) "=" info #else /* !MODULE */ -#define __MODULE_INFO(tag, name, info) +/* This struct is here for syntactic coherency, it is not used */ +#define __MODULE_INFO(tag, name, info) \ + struct __module_cat(name,__LINE__) {} #endif #define __MODULE_PARM_TYPE(name, _type) \ __MODULE_INFO(parmtype, name##type, #name ":" _type) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d971346b034..371fa8839d5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -75,6 +75,9 @@ struct wireless_dev; #define NET_RX_SUCCESS 0 /* keep 'em coming, baby */ #define NET_RX_DROP 1 /* packet dropped */ +/* Initial net device group. All devices belong to group 0 by default. */ +#define INIT_NETDEV_GROUP 0 + /* * Transmit return codes: transmit return codes originate from three different * namespaces: @@ -643,6 +646,14 @@ struct xps_dev_maps { (nr_cpu_ids * sizeof(struct xps_map *))) #endif /* CONFIG_XPS */ +#define TC_MAX_QUEUE 16 +#define TC_BITMASK 15 +/* HW offloaded queuing disciplines txq count and offset maps */ +struct netdev_tc_txq { + u16 count; + u16 offset; +}; + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -753,6 +764,11 @@ struct xps_dev_maps { * int (*ndo_set_vf_port)(struct net_device *dev, int vf, * struct nlattr *port[]); * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); + * int (*ndo_setup_tc)(struct net_device *dev, u8 tc) + * Called to setup 'tc' number of traffic classes in the net device. This + * is always called from the stack with the rtnl lock held and netif tx + * queues stopped. This allows the netdevice to perform queue management + * safely. */ #define HAVE_NET_DEVICE_OPS struct net_device_ops { @@ -811,6 +827,7 @@ struct net_device_ops { struct nlattr *port[]); int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb); + int (*ndo_setup_tc)(struct net_device *dev, u8 tc); #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) int (*ndo_fcoe_enable)(struct net_device *dev); int (*ndo_fcoe_disable)(struct net_device *dev); @@ -1143,6 +1160,9 @@ struct net_device { /* Data Center Bridging netlink ops */ const struct dcbnl_rtnl_ops *dcbnl_ops; #endif + u8 num_tc; + struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE]; + u8 prio_tc_map[TC_BITMASK + 1]; #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) /* max exchange id for FCoE LRO by ddp */ @@ -1153,12 +1173,66 @@ struct net_device { /* phy device may attach itself for hardware timestamping */ struct phy_device *phydev; + + /* group the device belongs to */ + int group; }; #define to_net_dev(d) container_of(d, struct net_device, dev) #define NETDEV_ALIGN 32 static inline +int netdev_get_prio_tc_map(const struct net_device *dev, u32 prio) +{ + return dev->prio_tc_map[prio & TC_BITMASK]; +} + +static inline +int netdev_set_prio_tc_map(struct net_device *dev, u8 prio, u8 tc) +{ + if (tc >= dev->num_tc) + return -EINVAL; + + dev->prio_tc_map[prio & TC_BITMASK] = tc & TC_BITMASK; + return 0; +} + +static inline +void netdev_reset_tc(struct net_device *dev) +{ + dev->num_tc = 0; + memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq)); + memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map)); +} + +static inline +int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset) +{ + if (tc >= dev->num_tc) + return -EINVAL; + + dev->tc_to_txq[tc].count = count; + dev->tc_to_txq[tc].offset = offset; + return 0; +} + +static inline +int netdev_set_num_tc(struct net_device *dev, u8 num_tc) +{ + if (num_tc > TC_MAX_QUEUE) + return -EINVAL; + + dev->num_tc = num_tc; + return 0; +} + +static inline +int netdev_get_num_tc(struct net_device *dev) +{ + return dev->num_tc; +} + +static inline struct netdev_queue *netdev_get_tx_queue(const struct net_device *dev, unsigned int index) { @@ -1844,6 +1918,7 @@ extern int dev_set_alias(struct net_device *, const char *, size_t); extern int dev_change_net_namespace(struct net_device *, struct net *, const char *); extern int dev_set_mtu(struct net_device *, int); +extern void dev_set_group(struct net_device *, int); extern int dev_set_mac_address(struct net_device *, struct sockaddr *); extern int dev_hard_start_xmit(struct sk_buff *skb, diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 1893837b396..eeec00abb66 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -24,16 +24,20 @@ #define NF_MAX_VERDICT NF_STOP /* we overload the higher bits for encoding auxiliary data such as the queue - * number. Not nice, but better than additional function arguments. */ -#define NF_VERDICT_MASK 0x0000ffff -#define NF_VERDICT_BITS 16 + * number or errno values. Not nice, but better than additional function + * arguments. */ +#define NF_VERDICT_MASK 0x000000ff + +/* extra verdict flags have mask 0x0000ff00 */ +#define NF_VERDICT_FLAG_QUEUE_BYPASS 0x00008000 +/* queue number (NF_QUEUE) or errno (NF_DROP) */ #define NF_VERDICT_QMASK 0xffff0000 #define NF_VERDICT_QBITS 16 -#define NF_QUEUE_NR(x) ((((x) << NF_VERDICT_BITS) & NF_VERDICT_QMASK) | NF_QUEUE) +#define NF_QUEUE_NR(x) ((((x) << 16) & NF_VERDICT_QMASK) | NF_QUEUE) -#define NF_DROP_ERR(x) (((-x) << NF_VERDICT_BITS) | NF_DROP) +#define NF_DROP_ERR(x) (((-x) << 16) | NF_DROP) /* only for userspace compatibility */ #ifndef __KERNEL__ @@ -41,6 +45,9 @@ <= 0x2000 is used for protocol-flags. */ #define NFC_UNKNOWN 0x4000 #define NFC_ALTERED 0x8000 + +/* NF_VERDICT_BITS should be 8 now, but userspace might break if this changes */ +#define NF_VERDICT_BITS 16 #endif enum nf_inet_hooks { @@ -72,6 +79,10 @@ union nf_inet_addr { #ifdef __KERNEL__ #ifdef CONFIG_NETFILTER +static inline int NF_DROP_GETERR(int verdict) +{ + return -(verdict >> NF_VERDICT_QBITS); +} static inline int nf_inet_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *a2) @@ -267,7 +278,7 @@ struct nf_afinfo { int route_key_size; }; -extern const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO]; +extern const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO]; static inline const struct nf_afinfo *nf_get_afinfo(unsigned short family) { return rcu_dereference(nf_afinfo[family]); @@ -357,9 +368,9 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #endif /*CONFIG_NETFILTER*/ #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) -extern void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); +extern void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu; extern void nf_ct_attach(struct sk_buff *, struct sk_buff *); -extern void (*nf_ct_destroy)(struct nf_conntrack *); +extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu; #else static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild index 9d40effe7ca..89c0d1e20d7 100644 --- a/include/linux/netfilter/Kbuild +++ b/include/linux/netfilter/Kbuild @@ -9,6 +9,7 @@ header-y += nfnetlink_conntrack.h header-y += nfnetlink_log.h header-y += nfnetlink_queue.h header-y += x_tables.h +header-y += xt_AUDIT.h header-y += xt_CHECKSUM.h header-y += xt_CLASSIFY.h header-y += xt_CONNMARK.h @@ -55,6 +56,7 @@ header-y += xt_rateest.h header-y += xt_realm.h header-y += xt_recent.h header-y += xt_sctp.h +header-y += xt_socket.h header-y += xt_state.h header-y += xt_statistic.h header-y += xt_string.h diff --git a/include/linux/netfilter/nf_conntrack_snmp.h b/include/linux/netfilter/nf_conntrack_snmp.h new file mode 100644 index 00000000000..064bc63a534 --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_snmp.h @@ -0,0 +1,9 @@ +#ifndef _NF_CONNTRACK_SNMP_H +#define _NF_CONNTRACK_SNMP_H + +extern int (*nf_nat_snmp_hook)(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); + +#endif /* _NF_CONNTRACK_SNMP_H */ diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 19711e3ffd4..debf1aefd75 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -42,6 +42,7 @@ enum ctattr_type { CTA_SECMARK, /* obsolete */ CTA_ZONE, CTA_SECCTX, + CTA_TIMESTAMP, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) @@ -127,6 +128,14 @@ enum ctattr_counters { }; #define CTA_COUNTERS_MAX (__CTA_COUNTERS_MAX - 1) +enum ctattr_tstamp { + CTA_TIMESTAMP_UNSPEC, + CTA_TIMESTAMP_START, + CTA_TIMESTAMP_STOP, + __CTA_TIMESTAMP_MAX +}; +#define CTA_TIMESTAMP_MAX (__CTA_TIMESTAMP_MAX - 1) + enum ctattr_nat { CTA_NAT_UNSPEC, CTA_NAT_MINIP, diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 6712e713b29..37219525ff6 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -611,8 +611,9 @@ struct _compat_xt_align { extern void xt_compat_lock(u_int8_t af); extern void xt_compat_unlock(u_int8_t af); -extern int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta); +extern int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta); extern void xt_compat_flush_offsets(u_int8_t af); +extern void xt_compat_init_offsets(u_int8_t af, unsigned int number); extern int xt_compat_calc_jump(u_int8_t af, unsigned int offset); extern int xt_compat_match_offset(const struct xt_match *match); diff --git a/include/linux/netfilter/xt_AUDIT.h b/include/linux/netfilter/xt_AUDIT.h new file mode 100644 index 00000000000..38751d2ea52 --- /dev/null +++ b/include/linux/netfilter/xt_AUDIT.h @@ -0,0 +1,30 @@ +/* + * Header file for iptables xt_AUDIT target + * + * (C) 2010-2011 Thomas Graf <tgraf@redhat.com> + * (C) 2010-2011 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef _XT_AUDIT_TARGET_H +#define _XT_AUDIT_TARGET_H + +#include <linux/types.h> + +enum { + XT_AUDIT_TYPE_ACCEPT = 0, + XT_AUDIT_TYPE_DROP, + XT_AUDIT_TYPE_REJECT, + __XT_AUDIT_TYPE_MAX, +}; + +#define XT_AUDIT_TYPE_MAX (__XT_AUDIT_TYPE_MAX - 1) + +struct xt_audit_info { + __u8 type; /* XT_AUDIT_TYPE_* */ +}; + +#endif /* _XT_AUDIT_TARGET_H */ diff --git a/include/linux/netfilter/xt_CT.h b/include/linux/netfilter/xt_CT.h index 1b564106891..b56e76811c0 100644 --- a/include/linux/netfilter/xt_CT.h +++ b/include/linux/netfilter/xt_CT.h @@ -1,14 +1,16 @@ #ifndef _XT_CT_H #define _XT_CT_H +#include <linux/types.h> + #define XT_CT_NOTRACK 0x1 struct xt_ct_target_info { - u_int16_t flags; - u_int16_t zone; - u_int32_t ct_events; - u_int32_t exp_events; - char helper[16]; + __u16 flags; + __u16 zone; + __u32 ct_events; + __u32 exp_events; + char helper[16]; /* Used internally by the kernel */ struct nf_conn *ct __attribute__((aligned(8))); diff --git a/include/linux/netfilter/xt_NFQUEUE.h b/include/linux/netfilter/xt_NFQUEUE.h index 2584f4a777d..9eafdbbb401 100644 --- a/include/linux/netfilter/xt_NFQUEUE.h +++ b/include/linux/netfilter/xt_NFQUEUE.h @@ -20,4 +20,10 @@ struct xt_NFQ_info_v1 { __u16 queues_total; }; +struct xt_NFQ_info_v2 { + __u16 queuenum; + __u16 queues_total; + __u16 bypass; +}; + #endif /* _XT_NFQ_TARGET_H */ diff --git a/include/linux/netfilter/xt_TCPOPTSTRIP.h b/include/linux/netfilter/xt_TCPOPTSTRIP.h index 2db543214ff..7157318499c 100644 --- a/include/linux/netfilter/xt_TCPOPTSTRIP.h +++ b/include/linux/netfilter/xt_TCPOPTSTRIP.h @@ -1,13 +1,15 @@ #ifndef _XT_TCPOPTSTRIP_H #define _XT_TCPOPTSTRIP_H +#include <linux/types.h> + #define tcpoptstrip_set_bit(bmap, idx) \ (bmap[(idx) >> 5] |= 1U << (idx & 31)) #define tcpoptstrip_test_bit(bmap, idx) \ (((1U << (idx & 31)) & bmap[(idx) >> 5]) != 0) struct xt_tcpoptstrip_target_info { - u_int32_t strip_bmap[8]; + __u32 strip_bmap[8]; }; #endif /* _XT_TCPOPTSTRIP_H */ diff --git a/include/linux/netfilter/xt_TPROXY.h b/include/linux/netfilter/xt_TPROXY.h index 3f3d6936128..902043c2073 100644 --- a/include/linux/netfilter/xt_TPROXY.h +++ b/include/linux/netfilter/xt_TPROXY.h @@ -1,19 +1,21 @@ #ifndef _XT_TPROXY_H #define _XT_TPROXY_H +#include <linux/types.h> + /* TPROXY target is capable of marking the packet to perform * redirection. We can get rid of that whenever we get support for * mutliple targets in the same rule. */ struct xt_tproxy_target_info { - u_int32_t mark_mask; - u_int32_t mark_value; + __u32 mark_mask; + __u32 mark_value; __be32 laddr; __be16 lport; }; struct xt_tproxy_target_info_v1 { - u_int32_t mark_mask; - u_int32_t mark_value; + __u32 mark_mask; + __u32 mark_value; union nf_inet_addr laddr; __be16 lport; }; diff --git a/include/linux/netfilter/xt_cluster.h b/include/linux/netfilter/xt_cluster.h index 886682656f0..9b883c8fbf5 100644 --- a/include/linux/netfilter/xt_cluster.h +++ b/include/linux/netfilter/xt_cluster.h @@ -1,15 +1,17 @@ #ifndef _XT_CLUSTER_MATCH_H #define _XT_CLUSTER_MATCH_H +#include <linux/types.h> + enum xt_cluster_flags { XT_CLUSTER_F_INV = (1 << 0) }; struct xt_cluster_match_info { - u_int32_t total_nodes; - u_int32_t node_mask; - u_int32_t hash_seed; - u_int32_t flags; + __u32 total_nodes; + __u32 node_mask; + __u32 hash_seed; + __u32 flags; }; #define XT_CLUSTER_NODES_MAX 32 diff --git a/include/linux/netfilter/xt_comment.h b/include/linux/netfilter/xt_comment.h index eacfedc6b5d..0ea5e79f5bd 100644 --- a/include/linux/netfilter/xt_comment.h +++ b/include/linux/netfilter/xt_comment.h @@ -4,7 +4,7 @@ #define XT_MAX_COMMENT_LEN 256 struct xt_comment_info { - unsigned char comment[XT_MAX_COMMENT_LEN]; + char comment[XT_MAX_COMMENT_LEN]; }; #endif /* XT_COMMENT_H */ diff --git a/include/linux/netfilter/xt_connlimit.h b/include/linux/netfilter/xt_connlimit.h index 7e3284bcbd2..0ca66e97acb 100644 --- a/include/linux/netfilter/xt_connlimit.h +++ b/include/linux/netfilter/xt_connlimit.h @@ -1,8 +1,15 @@ #ifndef _XT_CONNLIMIT_H #define _XT_CONNLIMIT_H +#include <linux/types.h> + struct xt_connlimit_data; +enum { + XT_CONNLIMIT_INVERT = 1 << 0, + XT_CONNLIMIT_DADDR = 1 << 1, +}; + struct xt_connlimit_info { union { union nf_inet_addr mask; @@ -13,7 +20,14 @@ struct xt_connlimit_info { }; #endif }; - unsigned int limit, inverse; + unsigned int limit; + union { + /* revision 0 */ + unsigned int inverse; + + /* revision 1 */ + __u32 flags; + }; /* Used internally by the kernel */ struct xt_connlimit_data *data __attribute__((aligned(8))); diff --git a/include/linux/netfilter/xt_conntrack.h b/include/linux/netfilter/xt_conntrack.h index 54f47a2f615..74b904d8f99 100644 --- a/include/linux/netfilter/xt_conntrack.h +++ b/include/linux/netfilter/xt_conntrack.h @@ -58,4 +58,19 @@ struct xt_conntrack_mtinfo2 { __u16 state_mask, status_mask; }; +struct xt_conntrack_mtinfo3 { + union nf_inet_addr origsrc_addr, origsrc_mask; + union nf_inet_addr origdst_addr, origdst_mask; + union nf_inet_addr replsrc_addr, replsrc_mask; + union nf_inet_addr repldst_addr, repldst_mask; + __u32 expires_min, expires_max; + __u16 l4proto; + __u16 origsrc_port, origdst_port; + __u16 replsrc_port, repldst_port; + __u16 match_flags, invert_flags; + __u16 state_mask, status_mask; + __u16 origsrc_port_high, origdst_port_high; + __u16 replsrc_port_high, repldst_port_high; +}; + #endif /*_XT_CONNTRACK_H*/ diff --git a/include/linux/netfilter/xt_quota.h b/include/linux/netfilter/xt_quota.h index b0d28c659ab..ca6e03e47a1 100644 --- a/include/linux/netfilter/xt_quota.h +++ b/include/linux/netfilter/xt_quota.h @@ -1,6 +1,8 @@ #ifndef _XT_QUOTA_H #define _XT_QUOTA_H +#include <linux/types.h> + enum xt_quota_flags { XT_QUOTA_INVERT = 0x1, }; @@ -9,9 +11,9 @@ enum xt_quota_flags { struct xt_quota_priv; struct xt_quota_info { - u_int32_t flags; - u_int32_t pad; - aligned_u64 quota; + __u32 flags; + __u32 pad; + aligned_u64 quota; /* Used internally by the kernel */ struct xt_quota_priv *master; diff --git a/include/linux/netfilter/xt_socket.h b/include/linux/netfilter/xt_socket.h index 6f475b8ff34..26d7217bd4f 100644 --- a/include/linux/netfilter/xt_socket.h +++ b/include/linux/netfilter/xt_socket.h @@ -1,6 +1,8 @@ #ifndef _XT_SOCKET_H #define _XT_SOCKET_H +#include <linux/types.h> + enum { XT_SOCKET_TRANSPARENT = 1 << 0, }; diff --git a/include/linux/netfilter/xt_time.h b/include/linux/netfilter/xt_time.h index 14b6df412c9..7c37fac576c 100644 --- a/include/linux/netfilter/xt_time.h +++ b/include/linux/netfilter/xt_time.h @@ -1,14 +1,16 @@ #ifndef _XT_TIME_H #define _XT_TIME_H 1 +#include <linux/types.h> + struct xt_time_info { - u_int32_t date_start; - u_int32_t date_stop; - u_int32_t daytime_start; - u_int32_t daytime_stop; - u_int32_t monthdays_match; - u_int8_t weekdays_match; - u_int8_t flags; + __u32 date_start; + __u32 date_stop; + __u32 daytime_start; + __u32 daytime_stop; + __u32 monthdays_match; + __u8 weekdays_match; + __u8 flags; }; enum { diff --git a/include/linux/netfilter/xt_u32.h b/include/linux/netfilter/xt_u32.h index 9947f56cdbd..04d1bfea03c 100644 --- a/include/linux/netfilter/xt_u32.h +++ b/include/linux/netfilter/xt_u32.h @@ -1,6 +1,8 @@ #ifndef _XT_U32_H #define _XT_U32_H 1 +#include <linux/types.h> + enum xt_u32_ops { XT_U32_AND, XT_U32_LEFTSH, @@ -9,13 +11,13 @@ enum xt_u32_ops { }; struct xt_u32_location_element { - u_int32_t number; - u_int8_t nextop; + __u32 number; + __u8 nextop; }; struct xt_u32_value_element { - u_int32_t min; - u_int32_t max; + __u32 min; + __u32 max; }; /* @@ -27,14 +29,14 @@ struct xt_u32_value_element { struct xt_u32_test { struct xt_u32_location_element location[XT_U32_MAXSIZE+1]; struct xt_u32_value_element value[XT_U32_MAXSIZE+1]; - u_int8_t nnums; - u_int8_t nvalues; + __u8 nnums; + __u8 nvalues; }; struct xt_u32 { struct xt_u32_test tests[XT_U32_MAXSIZE+1]; - u_int8_t ntests; - u_int8_t invert; + __u8 ntests; + __u8 invert; }; #endif /* _XT_U32_H */ diff --git a/include/linux/netfilter_bridge/ebt_802_3.h b/include/linux/netfilter_bridge/ebt_802_3.h index c73ef0b18bd..be5be1577a5 100644 --- a/include/linux/netfilter_bridge/ebt_802_3.h +++ b/include/linux/netfilter_bridge/ebt_802_3.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BRIDGE_EBT_802_3_H #define __LINUX_BRIDGE_EBT_802_3_H +#include <linux/types.h> + #define EBT_802_3_SAP 0x01 #define EBT_802_3_TYPE 0x02 @@ -24,24 +26,24 @@ /* ui has one byte ctrl, ni has two */ struct hdr_ui { - uint8_t dsap; - uint8_t ssap; - uint8_t ctrl; - uint8_t orig[3]; + __u8 dsap; + __u8 ssap; + __u8 ctrl; + __u8 orig[3]; __be16 type; }; struct hdr_ni { - uint8_t dsap; - uint8_t ssap; + __u8 dsap; + __u8 ssap; __be16 ctrl; - uint8_t orig[3]; + __u8 orig[3]; __be16 type; }; struct ebt_802_3_hdr { - uint8_t daddr[6]; - uint8_t saddr[6]; + __u8 daddr[6]; + __u8 saddr[6]; __be16 len; union { struct hdr_ui ui; @@ -59,10 +61,10 @@ static inline struct ebt_802_3_hdr *ebt_802_3_hdr(const struct sk_buff *skb) #endif struct ebt_802_3_info { - uint8_t sap; + __u8 sap; __be16 type; - uint8_t bitmask; - uint8_t invflags; + __u8 bitmask; + __u8 invflags; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_among.h b/include/linux/netfilter_bridge/ebt_among.h index 0009558609a..bd4e3ad0b70 100644 --- a/include/linux/netfilter_bridge/ebt_among.h +++ b/include/linux/netfilter_bridge/ebt_among.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BRIDGE_EBT_AMONG_H #define __LINUX_BRIDGE_EBT_AMONG_H +#include <linux/types.h> + #define EBT_AMONG_DST 0x01 #define EBT_AMONG_SRC 0x02 @@ -30,7 +32,7 @@ */ struct ebt_mac_wormhash_tuple { - uint32_t cmp[2]; + __u32 cmp[2]; __be32 ip; }; diff --git a/include/linux/netfilter_bridge/ebt_arp.h b/include/linux/netfilter_bridge/ebt_arp.h index cbf4843b6b0..522f3e427f4 100644 --- a/include/linux/netfilter_bridge/ebt_arp.h +++ b/include/linux/netfilter_bridge/ebt_arp.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BRIDGE_EBT_ARP_H #define __LINUX_BRIDGE_EBT_ARP_H +#include <linux/types.h> + #define EBT_ARP_OPCODE 0x01 #define EBT_ARP_HTYPE 0x02 #define EBT_ARP_PTYPE 0x04 @@ -27,8 +29,8 @@ struct ebt_arp_info unsigned char smmsk[ETH_ALEN]; unsigned char dmaddr[ETH_ALEN]; unsigned char dmmsk[ETH_ALEN]; - uint8_t bitmask; - uint8_t invflags; + __u8 bitmask; + __u8 invflags; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_ip.h b/include/linux/netfilter_bridge/ebt_ip.h index 6a708fb9224..c4bbc41b0ea 100644 --- a/include/linux/netfilter_bridge/ebt_ip.h +++ b/include/linux/netfilter_bridge/ebt_ip.h @@ -15,6 +15,8 @@ #ifndef __LINUX_BRIDGE_EBT_IP_H #define __LINUX_BRIDGE_EBT_IP_H +#include <linux/types.h> + #define EBT_IP_SOURCE 0x01 #define EBT_IP_DEST 0x02 #define EBT_IP_TOS 0x04 @@ -31,12 +33,12 @@ struct ebt_ip_info { __be32 daddr; __be32 smsk; __be32 dmsk; - uint8_t tos; - uint8_t protocol; - uint8_t bitmask; - uint8_t invflags; - uint16_t sport[2]; - uint16_t dport[2]; + __u8 tos; + __u8 protocol; + __u8 bitmask; + __u8 invflags; + __u16 sport[2]; + __u16 dport[2]; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_ip6.h b/include/linux/netfilter_bridge/ebt_ip6.h index e5de9870151..42b88968272 100644 --- a/include/linux/netfilter_bridge/ebt_ip6.h +++ b/include/linux/netfilter_bridge/ebt_ip6.h @@ -12,14 +12,19 @@ #ifndef __LINUX_BRIDGE_EBT_IP6_H #define __LINUX_BRIDGE_EBT_IP6_H +#include <linux/types.h> + #define EBT_IP6_SOURCE 0x01 #define EBT_IP6_DEST 0x02 #define EBT_IP6_TCLASS 0x04 #define EBT_IP6_PROTO 0x08 #define EBT_IP6_SPORT 0x10 #define EBT_IP6_DPORT 0x20 +#define EBT_IP6_ICMP6 0x40 + #define EBT_IP6_MASK (EBT_IP6_SOURCE | EBT_IP6_DEST | EBT_IP6_TCLASS |\ - EBT_IP6_PROTO | EBT_IP6_SPORT | EBT_IP6_DPORT) + EBT_IP6_PROTO | EBT_IP6_SPORT | EBT_IP6_DPORT | \ + EBT_IP6_ICMP6) #define EBT_IP6_MATCH "ip6" /* the same values are used for the invflags */ @@ -28,12 +33,18 @@ struct ebt_ip6_info { struct in6_addr daddr; struct in6_addr smsk; struct in6_addr dmsk; - uint8_t tclass; - uint8_t protocol; - uint8_t bitmask; - uint8_t invflags; - uint16_t sport[2]; - uint16_t dport[2]; + __u8 tclass; + __u8 protocol; + __u8 bitmask; + __u8 invflags; + union { + __u16 sport[2]; + __u8 icmpv6_type[2]; + }; + union { + __u16 dport[2]; + __u8 icmpv6_code[2]; + }; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_limit.h b/include/linux/netfilter_bridge/ebt_limit.h index 4bf76b75167..66d80b30ba0 100644 --- a/include/linux/netfilter_bridge/ebt_limit.h +++ b/include/linux/netfilter_bridge/ebt_limit.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BRIDGE_EBT_LIMIT_H #define __LINUX_BRIDGE_EBT_LIMIT_H +#include <linux/types.h> + #define EBT_LIMIT_MATCH "limit" /* timings are in milliseconds. */ @@ -10,13 +12,13 @@ seconds, or one every 59 hours. */ struct ebt_limit_info { - u_int32_t avg; /* Average secs between packets * scale */ - u_int32_t burst; /* Period multiplier for upper limit. */ + __u32 avg; /* Average secs between packets * scale */ + __u32 burst; /* Period multiplier for upper limit. */ /* Used internally by the kernel */ unsigned long prev; - u_int32_t credit; - u_int32_t credit_cap, cost; + __u32 credit; + __u32 credit_cap, cost; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_log.h b/include/linux/netfilter_bridge/ebt_log.h index cc2cdfb764b..7e7f1d1fe49 100644 --- a/include/linux/netfilter_bridge/ebt_log.h +++ b/include/linux/netfilter_bridge/ebt_log.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BRIDGE_EBT_LOG_H #define __LINUX_BRIDGE_EBT_LOG_H +#include <linux/types.h> + #define EBT_LOG_IP 0x01 /* if the frame is made by ip, log the ip information */ #define EBT_LOG_ARP 0x02 #define EBT_LOG_NFLOG 0x04 @@ -10,9 +12,9 @@ #define EBT_LOG_WATCHER "log" struct ebt_log_info { - uint8_t loglevel; - uint8_t prefix[EBT_LOG_PREFIX_SIZE]; - uint32_t bitmask; + __u8 loglevel; + __u8 prefix[EBT_LOG_PREFIX_SIZE]; + __u32 bitmask; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_mark_m.h b/include/linux/netfilter_bridge/ebt_mark_m.h index 9ceb10ec0ed..410f9e5a71d 100644 --- a/include/linux/netfilter_bridge/ebt_mark_m.h +++ b/include/linux/netfilter_bridge/ebt_mark_m.h @@ -1,13 +1,15 @@ #ifndef __LINUX_BRIDGE_EBT_MARK_M_H #define __LINUX_BRIDGE_EBT_MARK_M_H +#include <linux/types.h> + #define EBT_MARK_AND 0x01 #define EBT_MARK_OR 0x02 #define EBT_MARK_MASK (EBT_MARK_AND | EBT_MARK_OR) struct ebt_mark_m_info { unsigned long mark, mask; - uint8_t invert; - uint8_t bitmask; + __u8 invert; + __u8 bitmask; }; #define EBT_MARK_MATCH "mark_m" diff --git a/include/linux/netfilter_bridge/ebt_nflog.h b/include/linux/netfilter_bridge/ebt_nflog.h index 052817849b8..df829fce912 100644 --- a/include/linux/netfilter_bridge/ebt_nflog.h +++ b/include/linux/netfilter_bridge/ebt_nflog.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BRIDGE_EBT_NFLOG_H #define __LINUX_BRIDGE_EBT_NFLOG_H +#include <linux/types.h> + #define EBT_NFLOG_MASK 0x0 #define EBT_NFLOG_PREFIX_SIZE 64 @@ -10,11 +12,11 @@ #define EBT_NFLOG_DEFAULT_THRESHOLD 1 struct ebt_nflog_info { - u_int32_t len; - u_int16_t group; - u_int16_t threshold; - u_int16_t flags; - u_int16_t pad; + __u32 len; + __u16 group; + __u16 threshold; + __u16 flags; + __u16 pad; char prefix[EBT_NFLOG_PREFIX_SIZE]; }; diff --git a/include/linux/netfilter_bridge/ebt_pkttype.h b/include/linux/netfilter_bridge/ebt_pkttype.h index 51a79984093..c241badcd03 100644 --- a/include/linux/netfilter_bridge/ebt_pkttype.h +++ b/include/linux/netfilter_bridge/ebt_pkttype.h @@ -1,9 +1,11 @@ #ifndef __LINUX_BRIDGE_EBT_PKTTYPE_H #define __LINUX_BRIDGE_EBT_PKTTYPE_H +#include <linux/types.h> + struct ebt_pkttype_info { - uint8_t pkt_type; - uint8_t invert; + __u8 pkt_type; + __u8 invert; }; #define EBT_PKTTYPE_MATCH "pkttype" diff --git a/include/linux/netfilter_bridge/ebt_stp.h b/include/linux/netfilter_bridge/ebt_stp.h index e503a0aa272..1025b9f5fb7 100644 --- a/include/linux/netfilter_bridge/ebt_stp.h +++ b/include/linux/netfilter_bridge/ebt_stp.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BRIDGE_EBT_STP_H #define __LINUX_BRIDGE_EBT_STP_H +#include <linux/types.h> + #define EBT_STP_TYPE 0x0001 #define EBT_STP_FLAGS 0x0002 @@ -21,24 +23,24 @@ #define EBT_STP_MATCH "stp" struct ebt_stp_config_info { - uint8_t flags; - uint16_t root_priol, root_priou; + __u8 flags; + __u16 root_priol, root_priou; char root_addr[6], root_addrmsk[6]; - uint32_t root_costl, root_costu; - uint16_t sender_priol, sender_priou; + __u32 root_costl, root_costu; + __u16 sender_priol, sender_priou; char sender_addr[6], sender_addrmsk[6]; - uint16_t portl, portu; - uint16_t msg_agel, msg_ageu; - uint16_t max_agel, max_ageu; - uint16_t hello_timel, hello_timeu; - uint16_t forward_delayl, forward_delayu; + __u16 portl, portu; + __u16 msg_agel, msg_ageu; + __u16 max_agel, max_ageu; + __u16 hello_timel, hello_timeu; + __u16 forward_delayl, forward_delayu; }; struct ebt_stp_info { - uint8_t type; + __u8 type; struct ebt_stp_config_info config; - uint16_t bitmask; - uint16_t invflags; + __u16 bitmask; + __u16 invflags; }; #endif diff --git a/include/linux/netfilter_bridge/ebt_ulog.h b/include/linux/netfilter_bridge/ebt_ulog.h index b677e267154..89a6becb526 100644 --- a/include/linux/netfilter_bridge/ebt_ulog.h +++ b/include/linux/netfilter_bridge/ebt_ulog.h @@ -1,6 +1,8 @@ #ifndef _EBT_ULOG_H #define _EBT_ULOG_H +#include <linux/types.h> + #define EBT_ULOG_DEFAULT_NLGROUP 0 #define EBT_ULOG_DEFAULT_QTHRESHOLD 1 #define EBT_ULOG_MAXNLGROUPS 32 /* hardcoded netlink max */ @@ -10,7 +12,7 @@ #define EBT_ULOG_VERSION 1 struct ebt_ulog_info { - uint32_t nlgroup; + __u32 nlgroup; unsigned int cprange; unsigned int qthreshold; char prefix[EBT_ULOG_PREFIX_LEN]; diff --git a/include/linux/netfilter_bridge/ebt_vlan.h b/include/linux/netfilter_bridge/ebt_vlan.h index 1d98be4031e..967d1d5cf98 100644 --- a/include/linux/netfilter_bridge/ebt_vlan.h +++ b/include/linux/netfilter_bridge/ebt_vlan.h @@ -1,6 +1,8 @@ #ifndef __LINUX_BRIDGE_EBT_VLAN_H #define __LINUX_BRIDGE_EBT_VLAN_H +#include <linux/types.h> + #define EBT_VLAN_ID 0x01 #define EBT_VLAN_PRIO 0x02 #define EBT_VLAN_ENCAP 0x04 @@ -8,12 +10,12 @@ #define EBT_VLAN_MATCH "vlan" struct ebt_vlan_info { - uint16_t id; /* VLAN ID {1-4095} */ - uint8_t prio; /* VLAN User Priority {0-7} */ + __u16 id; /* VLAN ID {1-4095} */ + __u8 prio; /* VLAN User Priority {0-7} */ __be16 encap; /* VLAN Encapsulated frame code {0-65535} */ - uint8_t bitmask; /* Args bitmask bit 1=1 - ID arg, + __u8 bitmask; /* Args bitmask bit 1=1 - ID arg, bit 2=1 User-Priority arg, bit 3=1 encap*/ - uint8_t invflags; /* Inverse bitmask bit 1=1 - inversed ID arg, + __u8 invflags; /* Inverse bitmask bit 1=1 - inversed ID arg, bit 2=1 - inversed Pirority arg */ }; diff --git a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h index e5a3687c8a7..c6a204c9704 100644 --- a/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h +++ b/include/linux/netfilter_ipv4/ipt_CLUSTERIP.h @@ -1,6 +1,8 @@ #ifndef _IPT_CLUSTERIP_H_target #define _IPT_CLUSTERIP_H_target +#include <linux/types.h> + enum clusterip_hashmode { CLUSTERIP_HASHMODE_SIP = 0, CLUSTERIP_HASHMODE_SIP_SPT, @@ -17,15 +19,15 @@ struct clusterip_config; struct ipt_clusterip_tgt_info { - u_int32_t flags; + __u32 flags; /* only relevant for new ones */ - u_int8_t clustermac[6]; - u_int16_t num_total_nodes; - u_int16_t num_local_nodes; - u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; - u_int32_t hash_mode; - u_int32_t hash_initval; + __u8 clustermac[6]; + __u16 num_total_nodes; + __u16 num_local_nodes; + __u16 local_nodes[CLUSTERIP_MAX_NODES]; + __u32 hash_mode; + __u32 hash_initval; /* Used internally by the kernel */ struct clusterip_config *config; diff --git a/include/linux/netfilter_ipv4/ipt_ECN.h b/include/linux/netfilter_ipv4/ipt_ECN.h index 7ca45918ab8..bb88d5315a4 100644 --- a/include/linux/netfilter_ipv4/ipt_ECN.h +++ b/include/linux/netfilter_ipv4/ipt_ECN.h @@ -8,6 +8,8 @@ */ #ifndef _IPT_ECN_TARGET_H #define _IPT_ECN_TARGET_H + +#include <linux/types.h> #include <linux/netfilter/xt_DSCP.h> #define IPT_ECN_IP_MASK (~XT_DSCP_MASK) @@ -19,11 +21,11 @@ #define IPT_ECN_OP_MASK 0xce struct ipt_ECN_info { - u_int8_t operation; /* bitset of operations */ - u_int8_t ip_ect; /* ECT codepoint of IPv4 header, pre-shifted */ + __u8 operation; /* bitset of operations */ + __u8 ip_ect; /* ECT codepoint of IPv4 header, pre-shifted */ union { struct { - u_int8_t ece:1, cwr:1; /* TCP ECT bits */ + __u8 ece:1, cwr:1; /* TCP ECT bits */ } tcp; } proto; }; diff --git a/include/linux/netfilter_ipv4/ipt_SAME.h b/include/linux/netfilter_ipv4/ipt_SAME.h index 2529660c5b3..5bca78267af 100644 --- a/include/linux/netfilter_ipv4/ipt_SAME.h +++ b/include/linux/netfilter_ipv4/ipt_SAME.h @@ -1,15 +1,17 @@ #ifndef _IPT_SAME_H #define _IPT_SAME_H +#include <linux/types.h> + #define IPT_SAME_MAX_RANGE 10 #define IPT_SAME_NODST 0x01 struct ipt_same_info { unsigned char info; - u_int32_t rangesize; - u_int32_t ipnum; - u_int32_t *iparray; + __u32 rangesize; + __u32 ipnum; + __u32 *iparray; /* hangs off end. */ struct nf_nat_range range[IPT_SAME_MAX_RANGE]; diff --git a/include/linux/netfilter_ipv4/ipt_TTL.h b/include/linux/netfilter_ipv4/ipt_TTL.h index ee6611edc11..f6ac169d92f 100644 --- a/include/linux/netfilter_ipv4/ipt_TTL.h +++ b/include/linux/netfilter_ipv4/ipt_TTL.h @@ -4,6 +4,8 @@ #ifndef _IPT_TTL_H #define _IPT_TTL_H +#include <linux/types.h> + enum { IPT_TTL_SET = 0, IPT_TTL_INC, @@ -13,8 +15,8 @@ enum { #define IPT_TTL_MAXMODE IPT_TTL_DEC struct ipt_TTL_info { - u_int8_t mode; - u_int8_t ttl; + __u8 mode; + __u8 ttl; }; diff --git a/include/linux/netfilter_ipv4/ipt_addrtype.h b/include/linux/netfilter_ipv4/ipt_addrtype.h index 446de6aef98..0da42237c8d 100644 --- a/include/linux/netfilter_ipv4/ipt_addrtype.h +++ b/include/linux/netfilter_ipv4/ipt_addrtype.h @@ -1,6 +1,8 @@ #ifndef _IPT_ADDRTYPE_H #define _IPT_ADDRTYPE_H +#include <linux/types.h> + enum { IPT_ADDRTYPE_INVERT_SOURCE = 0x0001, IPT_ADDRTYPE_INVERT_DEST = 0x0002, @@ -9,17 +11,17 @@ enum { }; struct ipt_addrtype_info_v1 { - u_int16_t source; /* source-type mask */ - u_int16_t dest; /* dest-type mask */ - u_int32_t flags; + __u16 source; /* source-type mask */ + __u16 dest; /* dest-type mask */ + __u32 flags; }; /* revision 0 */ struct ipt_addrtype_info { - u_int16_t source; /* source-type mask */ - u_int16_t dest; /* dest-type mask */ - u_int32_t invert_source; - u_int32_t invert_dest; + __u16 source; /* source-type mask */ + __u16 dest; /* dest-type mask */ + __u32 invert_source; + __u32 invert_dest; }; #endif diff --git a/include/linux/netfilter_ipv4/ipt_ah.h b/include/linux/netfilter_ipv4/ipt_ah.h index 2e555b4d05e..4e02bb0119e 100644 --- a/include/linux/netfilter_ipv4/ipt_ah.h +++ b/include/linux/netfilter_ipv4/ipt_ah.h @@ -1,9 +1,11 @@ #ifndef _IPT_AH_H #define _IPT_AH_H +#include <linux/types.h> + struct ipt_ah { - u_int32_t spis[2]; /* Security Parameter Index */ - u_int8_t invflags; /* Inverse flags */ + __u32 spis[2]; /* Security Parameter Index */ + __u8 invflags; /* Inverse flags */ }; diff --git a/include/linux/netfilter_ipv4/ipt_ecn.h b/include/linux/netfilter_ipv4/ipt_ecn.h index 9945baa4ccd..eabf95fb7d3 100644 --- a/include/linux/netfilter_ipv4/ipt_ecn.h +++ b/include/linux/netfilter_ipv4/ipt_ecn.h @@ -8,6 +8,8 @@ */ #ifndef _IPT_ECN_H #define _IPT_ECN_H + +#include <linux/types.h> #include <linux/netfilter/xt_dscp.h> #define IPT_ECN_IP_MASK (~XT_DSCP_MASK) @@ -20,12 +22,12 @@ /* match info */ struct ipt_ecn_info { - u_int8_t operation; - u_int8_t invert; - u_int8_t ip_ect; + __u8 operation; + __u8 invert; + __u8 ip_ect; union { struct { - u_int8_t ect; + __u8 ect; } tcp; } proto; }; diff --git a/include/linux/netfilter_ipv4/ipt_ttl.h b/include/linux/netfilter_ipv4/ipt_ttl.h index ee24fd86a3a..37bee444248 100644 --- a/include/linux/netfilter_ipv4/ipt_ttl.h +++ b/include/linux/netfilter_ipv4/ipt_ttl.h @@ -4,6 +4,8 @@ #ifndef _IPT_TTL_H #define _IPT_TTL_H +#include <linux/types.h> + enum { IPT_TTL_EQ = 0, /* equals */ IPT_TTL_NE, /* not equals */ @@ -13,8 +15,8 @@ enum { struct ipt_ttl_info { - u_int8_t mode; - u_int8_t ttl; + __u8 mode; + __u8 ttl; }; diff --git a/include/linux/netfilter_ipv6/ip6t_HL.h b/include/linux/netfilter_ipv6/ip6t_HL.h index afb7813d45a..ebd8ead1bb6 100644 --- a/include/linux/netfilter_ipv6/ip6t_HL.h +++ b/include/linux/netfilter_ipv6/ip6t_HL.h @@ -5,6 +5,8 @@ #ifndef _IP6T_HL_H #define _IP6T_HL_H +#include <linux/types.h> + enum { IP6T_HL_SET = 0, IP6T_HL_INC, @@ -14,8 +16,8 @@ enum { #define IP6T_HL_MAXMODE IP6T_HL_DEC struct ip6t_HL_info { - u_int8_t mode; - u_int8_t hop_limit; + __u8 mode; + __u8 hop_limit; }; diff --git a/include/linux/netfilter_ipv6/ip6t_REJECT.h b/include/linux/netfilter_ipv6/ip6t_REJECT.h index 6be6504162b..205ed62e460 100644 --- a/include/linux/netfilter_ipv6/ip6t_REJECT.h +++ b/include/linux/netfilter_ipv6/ip6t_REJECT.h @@ -1,6 +1,8 @@ #ifndef _IP6T_REJECT_H #define _IP6T_REJECT_H +#include <linux/types.h> + enum ip6t_reject_with { IP6T_ICMP6_NO_ROUTE, IP6T_ICMP6_ADM_PROHIBITED, @@ -12,7 +14,7 @@ enum ip6t_reject_with { }; struct ip6t_reject_info { - u_int32_t with; /* reject type */ + __u32 with; /* reject type */ }; #endif /*_IP6T_REJECT_H*/ diff --git a/include/linux/netfilter_ipv6/ip6t_ah.h b/include/linux/netfilter_ipv6/ip6t_ah.h index 17a745cfb2c..5da2b65cb3a 100644 --- a/include/linux/netfilter_ipv6/ip6t_ah.h +++ b/include/linux/netfilter_ipv6/ip6t_ah.h @@ -1,11 +1,13 @@ #ifndef _IP6T_AH_H #define _IP6T_AH_H +#include <linux/types.h> + struct ip6t_ah { - u_int32_t spis[2]; /* Security Parameter Index */ - u_int32_t hdrlen; /* Header Length */ - u_int8_t hdrres; /* Test of the Reserved Filed */ - u_int8_t invflags; /* Inverse flags */ + __u32 spis[2]; /* Security Parameter Index */ + __u32 hdrlen; /* Header Length */ + __u8 hdrres; /* Test of the Reserved Filed */ + __u8 invflags; /* Inverse flags */ }; #define IP6T_AH_SPI 0x01 diff --git a/include/linux/netfilter_ipv6/ip6t_frag.h b/include/linux/netfilter_ipv6/ip6t_frag.h index 3724d085092..b47f61b9e08 100644 --- a/include/linux/netfilter_ipv6/ip6t_frag.h +++ b/include/linux/netfilter_ipv6/ip6t_frag.h @@ -1,11 +1,13 @@ #ifndef _IP6T_FRAG_H #define _IP6T_FRAG_H +#include <linux/types.h> + struct ip6t_frag { - u_int32_t ids[2]; /* Security Parameter Index */ - u_int32_t hdrlen; /* Header Length */ - u_int8_t flags; /* */ - u_int8_t invflags; /* Inverse flags */ + __u32 ids[2]; /* Security Parameter Index */ + __u32 hdrlen; /* Header Length */ + __u8 flags; /* */ + __u8 invflags; /* Inverse flags */ }; #define IP6T_FRAG_IDS 0x01 diff --git a/include/linux/netfilter_ipv6/ip6t_hl.h b/include/linux/netfilter_ipv6/ip6t_hl.h index 5ef91b8319a..6e76dbc6c19 100644 --- a/include/linux/netfilter_ipv6/ip6t_hl.h +++ b/include/linux/netfilter_ipv6/ip6t_hl.h @@ -5,6 +5,8 @@ #ifndef _IP6T_HL_H #define _IP6T_HL_H +#include <linux/types.h> + enum { IP6T_HL_EQ = 0, /* equals */ IP6T_HL_NE, /* not equals */ @@ -14,8 +16,8 @@ enum { struct ip6t_hl_info { - u_int8_t mode; - u_int8_t hop_limit; + __u8 mode; + __u8 hop_limit; }; diff --git a/include/linux/netfilter_ipv6/ip6t_ipv6header.h b/include/linux/netfilter_ipv6/ip6t_ipv6header.h index 01dfd445596..efae3a20c21 100644 --- a/include/linux/netfilter_ipv6/ip6t_ipv6header.h +++ b/include/linux/netfilter_ipv6/ip6t_ipv6header.h @@ -8,10 +8,12 @@ on whether they contain certain headers */ #ifndef __IPV6HEADER_H #define __IPV6HEADER_H +#include <linux/types.h> + struct ip6t_ipv6header_info { - u_int8_t matchflags; - u_int8_t invflags; - u_int8_t modeflag; + __u8 matchflags; + __u8 invflags; + __u8 modeflag; }; #define MASK_HOPOPTS 128 diff --git a/include/linux/netfilter_ipv6/ip6t_mh.h b/include/linux/netfilter_ipv6/ip6t_mh.h index 18549bca2d1..a7729a5025c 100644 --- a/include/linux/netfilter_ipv6/ip6t_mh.h +++ b/include/linux/netfilter_ipv6/ip6t_mh.h @@ -1,10 +1,12 @@ #ifndef _IP6T_MH_H #define _IP6T_MH_H +#include <linux/types.h> + /* MH matching stuff */ struct ip6t_mh { - u_int8_t types[2]; /* MH type range */ - u_int8_t invflags; /* Inverse flags */ + __u8 types[2]; /* MH type range */ + __u8 invflags; /* Inverse flags */ }; /* Values for "invflags" field in struct ip6t_mh. */ diff --git a/include/linux/netfilter_ipv6/ip6t_opts.h b/include/linux/netfilter_ipv6/ip6t_opts.h index 62d89bcd9f9..17d419a811f 100644 --- a/include/linux/netfilter_ipv6/ip6t_opts.h +++ b/include/linux/netfilter_ipv6/ip6t_opts.h @@ -1,14 +1,16 @@ #ifndef _IP6T_OPTS_H #define _IP6T_OPTS_H +#include <linux/types.h> + #define IP6T_OPTS_OPTSNR 16 struct ip6t_opts { - u_int32_t hdrlen; /* Header Length */ - u_int8_t flags; /* */ - u_int8_t invflags; /* Inverse flags */ - u_int16_t opts[IP6T_OPTS_OPTSNR]; /* opts */ - u_int8_t optsnr; /* Nr of OPts */ + __u32 hdrlen; /* Header Length */ + __u8 flags; /* */ + __u8 invflags; /* Inverse flags */ + __u16 opts[IP6T_OPTS_OPTSNR]; /* opts */ + __u8 optsnr; /* Nr of OPts */ }; #define IP6T_OPTS_LEN 0x01 diff --git a/include/linux/netfilter_ipv6/ip6t_rt.h b/include/linux/netfilter_ipv6/ip6t_rt.h index ab91bfd2cd0..7605a5ff81c 100644 --- a/include/linux/netfilter_ipv6/ip6t_rt.h +++ b/include/linux/netfilter_ipv6/ip6t_rt.h @@ -1,18 +1,19 @@ #ifndef _IP6T_RT_H #define _IP6T_RT_H +#include <linux/types.h> /*#include <linux/in6.h>*/ #define IP6T_RT_HOPS 16 struct ip6t_rt { - u_int32_t rt_type; /* Routing Type */ - u_int32_t segsleft[2]; /* Segments Left */ - u_int32_t hdrlen; /* Header Length */ - u_int8_t flags; /* */ - u_int8_t invflags; /* Inverse flags */ + __u32 rt_type; /* Routing Type */ + __u32 segsleft[2]; /* Segments Left */ + __u32 hdrlen; /* Header Length */ + __u8 flags; /* */ + __u8 invflags; /* Inverse flags */ struct in6_addr addrs[IP6T_RT_HOPS]; /* Hops */ - u_int8_t addrnr; /* Nr of Addresses */ + __u8 addrnr; /* Nr of Addresses */ }; #define IP6T_RT_TYP 0x01 diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index 2cfa4bc8dea..776cd93d5f7 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -481,4 +481,16 @@ struct tc_drr_stats { __u32 deficit; }; +/* MQPRIO */ +#define TC_QOPT_BITMASK 15 +#define TC_QOPT_MAX_QUEUE 16 + +struct tc_mqprio_qopt { + __u8 num_tc; + __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; + __u8 hw; + __u16 count[TC_QOPT_MAX_QUEUE]; + __u16 offset[TC_QOPT_MAX_QUEUE]; +}; + #endif diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 3c995b4d742..a0b639f8e80 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -235,8 +235,6 @@ extern int rtc_irq_set_freq(struct rtc_device *rtc, struct rtc_task *task, int freq); extern int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled); extern int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled); -extern int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc, - unsigned int enabled); void rtc_aie_update_irq(void *private); void rtc_uie_update_irq(void *private); @@ -246,8 +244,6 @@ int rtc_register(rtc_task_t *task); int rtc_unregister(rtc_task_t *task); int rtc_control(rtc_task_t *t, unsigned int cmd, unsigned long arg); -void rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer); -void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer); void rtc_timer_init(struct rtc_timer *timer, void (*f)(void* p), void* data); int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer* timer, ktime_t expires, ktime_t period); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bf221d65d9a..6e946da9d1d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1801,6 +1801,15 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) prefetch(skb->prev), (skb != (struct sk_buff *)(queue)); \ skb = skb->prev) +#define skb_queue_reverse_walk_safe(queue, skb, tmp) \ + for (skb = (queue)->prev, tmp = skb->prev; \ + skb != (struct sk_buff *)(queue); \ + skb = tmp, tmp = skb->prev) + +#define skb_queue_reverse_walk_from_safe(queue, skb, tmp) \ + for (tmp = skb->prev; \ + skb != (struct sk_buff *)(queue); \ + skb = tmp, tmp = skb->prev) static inline bool skb_has_frag_list(const struct sk_buff *skb) { diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 0093dd7c1d6..800617b4ddd 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -109,7 +109,10 @@ static inline bool virtio_has_feature(const struct virtio_device *vdev, unsigned int fbit) { /* Did you forget to fix assumptions on max features? */ - MAYBE_BUILD_BUG_ON(fbit >= 32); + if (__builtin_constant_p(fbit)) + BUILD_BUG_ON(fbit >= 32); + else + BUG_ON(fbit >= 32); if (fbit < VIRTIO_TRANSPORT_F_START) virtio_check_driver_offered_feature(vdev, fbit); diff --git a/include/net/dst.h b/include/net/dst.h index 93b0310317b..be5a0d4c491 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -72,7 +72,7 @@ struct dst_entry { u32 _metrics[RTAX_MAX]; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID __u32 tclassid; #else __u32 __pad2; diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 07bdb5e9e8a..65d1fcdbc63 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -55,7 +55,7 @@ struct fib_nh { int nh_weight; int nh_power; #endif -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif int nh_oif; @@ -201,7 +201,7 @@ static inline int fib_lookup(struct net *net, const struct flowi *flp, extern int __net_init fib4_rules_init(struct net *net); extern void __net_exit fib4_rules_exit(struct net *net); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID extern u32 fib_rules_tclass(struct fib_result *res); #endif @@ -235,7 +235,7 @@ extern struct fib_table *fib_hash_table(u32 id); static inline void fib_combine_itag(u32 *itag, struct fib_result *res) { -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES u32 rtag; #endif diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index b7bbd6c28cf..b23bea62f70 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -28,6 +28,80 @@ #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) #include <net/netfilter/nf_conntrack.h> #endif +#include <net/net_namespace.h> /* Netw namespace */ + +/* + * Generic access of ipvs struct + */ +static inline struct netns_ipvs *net_ipvs(struct net* net) +{ + return net->ipvs; +} +/* + * Get net ptr from skb in traffic cases + * use skb_sknet when call is from userland (ioctl or netlink) + */ +static inline struct net *skb_net(const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_NS +#ifdef CONFIG_IP_VS_DEBUG + /* + * This is used for debug only. + * Start with the most likely hit + * End with BUG + */ + if (likely(skb->dev && skb->dev->nd_net)) + return dev_net(skb->dev); + if (skb_dst(skb)->dev) + return dev_net(skb_dst(skb)->dev); + WARN(skb->sk, "Maybe skb_sknet should be used in %s() at line:%d\n", + __func__, __LINE__); + if (likely(skb->sk && skb->sk->sk_net)) + return sock_net(skb->sk); + pr_err("There is no net ptr to find in the skb in %s() line:%d\n", + __func__, __LINE__); + BUG(); +#else + return dev_net(skb->dev ? : skb_dst(skb)->dev); +#endif +#else + return &init_net; +#endif +} + +static inline struct net *skb_sknet(const struct sk_buff *skb) +{ +#ifdef CONFIG_NET_NS +#ifdef CONFIG_IP_VS_DEBUG + /* Start with the most likely hit */ + if (likely(skb->sk && skb->sk->sk_net)) + return sock_net(skb->sk); + WARN(skb->dev, "Maybe skb_net should be used instead in %s() line:%d\n", + __func__, __LINE__); + if (likely(skb->dev && skb->dev->nd_net)) + return dev_net(skb->dev); + pr_err("There is no net ptr to find in the skb in %s() line:%d\n", + __func__, __LINE__); + BUG(); +#else + return sock_net(skb->sk); +#endif +#else + return &init_net; +#endif +} +/* + * This one needed for single_open_net since net is stored directly in + * private not as a struct i.e. seq_file_net cant be used. + */ +static inline struct net *seq_file_single_net(struct seq_file *seq) +{ +#ifdef CONFIG_NET_NS + return (struct net *)seq->private; +#else + return &init_net; +#endif +} /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; @@ -258,6 +332,23 @@ struct ip_vs_seq { before last resized pkt */ }; +/* + * counters per cpu + */ +struct ip_vs_counters { + __u32 conns; /* connections scheduled */ + __u32 inpkts; /* incoming packets */ + __u32 outpkts; /* outgoing packets */ + __u64 inbytes; /* incoming bytes */ + __u64 outbytes; /* outgoing bytes */ +}; +/* + * Stats per cpu + */ +struct ip_vs_cpu_stats { + struct ip_vs_counters ustats; + struct u64_stats_sync syncp; +}; /* * IPVS statistics objects @@ -279,17 +370,34 @@ struct ip_vs_estimator { }; struct ip_vs_stats { - struct ip_vs_stats_user ustats; /* statistics */ + struct ip_vs_stats_user ustats; /* statistics */ struct ip_vs_estimator est; /* estimator */ - - spinlock_t lock; /* spin lock */ + struct ip_vs_cpu_stats *cpustats; /* per cpu counters */ + spinlock_t lock; /* spin lock */ }; +/* + * Helper Macros for per cpu + * ipvs->tot_stats->ustats.count + */ +#define IPVS_STAT_INC(ipvs, count) \ + __this_cpu_inc((ipvs)->ustats->count) + +#define IPVS_STAT_ADD(ipvs, count, value) \ + do {\ + write_seqcount_begin(per_cpu_ptr((ipvs)->ustats_seq, \ + raw_smp_processor_id())); \ + __this_cpu_add((ipvs)->ustats->count, value); \ + write_seqcount_end(per_cpu_ptr((ipvs)->ustats_seq, \ + raw_smp_processor_id())); \ + } while (0) + struct dst_entry; struct iphdr; struct ip_vs_conn; struct ip_vs_app; struct sk_buff; +struct ip_vs_proto_data; struct ip_vs_protocol { struct ip_vs_protocol *next; @@ -297,21 +405,22 @@ struct ip_vs_protocol { u16 protocol; u16 num_states; int dont_defrag; - atomic_t appcnt; /* counter of proto app incs */ - int *timeout_table; /* protocol timeout table */ void (*init)(struct ip_vs_protocol *pp); void (*exit)(struct ip_vs_protocol *pp); + void (*init_netns)(struct net *net, struct ip_vs_proto_data *pd); + + void (*exit_netns)(struct net *net, struct ip_vs_proto_data *pd); + int (*conn_schedule)(int af, struct sk_buff *skb, - struct ip_vs_protocol *pp, + struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp); struct ip_vs_conn * (*conn_in_get)(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse); @@ -319,7 +428,6 @@ struct ip_vs_protocol { struct ip_vs_conn * (*conn_out_get)(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse); @@ -337,11 +445,11 @@ struct ip_vs_protocol { int (*state_transition)(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp); + struct ip_vs_proto_data *pd); - int (*register_app)(struct ip_vs_app *inc); + int (*register_app)(struct net *net, struct ip_vs_app *inc); - void (*unregister_app)(struct ip_vs_app *inc); + void (*unregister_app)(struct net *net, struct ip_vs_app *inc); int (*app_conn_bind)(struct ip_vs_conn *cp); @@ -350,14 +458,26 @@ struct ip_vs_protocol { int offset, const char *msg); - void (*timeout_change)(struct ip_vs_protocol *pp, int flags); + void (*timeout_change)(struct ip_vs_proto_data *pd, int flags); +}; - int (*set_state_timeout)(struct ip_vs_protocol *pp, char *sname, int to); +/* + * protocol data per netns + */ +struct ip_vs_proto_data { + struct ip_vs_proto_data *next; + struct ip_vs_protocol *pp; + int *timeout_table; /* protocol timeout table */ + atomic_t appcnt; /* counter of proto app incs. */ + struct tcp_states_t *tcp_state_table; }; -extern struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto); +extern struct ip_vs_protocol *ip_vs_proto_get(unsigned short proto); +extern struct ip_vs_proto_data *ip_vs_proto_data_get(struct net *net, + unsigned short proto); struct ip_vs_conn_param { + struct net *net; const union nf_inet_addr *caddr; const union nf_inet_addr *vaddr; __be16 cport; @@ -375,16 +495,19 @@ struct ip_vs_conn_param { */ struct ip_vs_conn { struct list_head c_list; /* hashed list heads */ - +#ifdef CONFIG_NET_NS + struct net *net; /* Name space */ +#endif /* Protocol, addresses and port numbers */ - u16 af; /* address family */ - union nf_inet_addr caddr; /* client address */ - union nf_inet_addr vaddr; /* virtual address */ - union nf_inet_addr daddr; /* destination address */ - volatile __u32 flags; /* status flags */ - __be16 cport; - __be16 vport; - __be16 dport; + u16 af; /* address family */ + __be16 cport; + __be16 vport; + __be16 dport; + __u32 fwmark; /* Fire wall mark from skb */ + union nf_inet_addr caddr; /* client address */ + union nf_inet_addr vaddr; /* virtual address */ + union nf_inet_addr daddr; /* destination address */ + volatile __u32 flags; /* status flags */ __u16 protocol; /* Which protocol (TCP/UDP) */ /* counter and timer */ @@ -422,10 +545,38 @@ struct ip_vs_conn { struct ip_vs_seq in_seq; /* incoming seq. struct */ struct ip_vs_seq out_seq; /* outgoing seq. struct */ + const struct ip_vs_pe *pe; char *pe_data; __u8 pe_data_len; }; +/* + * To save some memory in conn table when name space is disabled. + */ +static inline struct net *ip_vs_conn_net(const struct ip_vs_conn *cp) +{ +#ifdef CONFIG_NET_NS + return cp->net; +#else + return &init_net; +#endif +} +static inline void ip_vs_conn_net_set(struct ip_vs_conn *cp, struct net *net) +{ +#ifdef CONFIG_NET_NS + cp->net = net; +#endif +} + +static inline int ip_vs_conn_net_eq(const struct ip_vs_conn *cp, + struct net *net) +{ +#ifdef CONFIG_NET_NS + return cp->net == net; +#else + return 1; +#endif +} /* * Extended internal versions of struct ip_vs_service_user and @@ -485,6 +636,7 @@ struct ip_vs_service { unsigned flags; /* service status flags */ unsigned timeout; /* persistent timeout in ticks */ __be32 netmask; /* grouping granularity */ + struct net *net; struct list_head destinations; /* real server d-linked list */ __u32 num_dests; /* number of servers */ @@ -510,8 +662,8 @@ struct ip_vs_dest { struct list_head d_list; /* for table with all the dests */ u16 af; /* address family */ - union nf_inet_addr addr; /* IP address of the server */ __be16 port; /* port number of the server */ + union nf_inet_addr addr; /* IP address of the server */ volatile unsigned flags; /* dest status flags */ atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ @@ -538,8 +690,8 @@ struct ip_vs_dest { /* for virtual service */ struct ip_vs_service *svc; /* service it belongs to */ __u16 protocol; /* which protocol (TCP/UDP) */ - union nf_inet_addr vaddr; /* virtual IP address */ __be16 vport; /* virtual port number */ + union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ }; @@ -674,13 +826,14 @@ enum { IP_VS_DIR_LAST, }; -static inline void ip_vs_conn_fill_param(int af, int protocol, +static inline void ip_vs_conn_fill_param(struct net *net, int af, int protocol, const union nf_inet_addr *caddr, __be16 cport, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { + p->net = net; p->af = af; p->protocol = protocol; p->caddr = caddr; @@ -695,7 +848,6 @@ struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse); @@ -703,7 +855,6 @@ struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p); struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse); @@ -719,14 +870,14 @@ extern void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, const union nf_inet_addr *daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest); + struct ip_vs_dest *dest, __u32 fwmark); extern void ip_vs_conn_expire_now(struct ip_vs_conn *cp); extern const char * ip_vs_state_name(__u16 proto, int state); -extern void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp); +extern void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp); extern int ip_vs_check_template(struct ip_vs_conn *ct); -extern void ip_vs_random_dropentry(void); +extern void ip_vs_random_dropentry(struct net *net); extern int ip_vs_conn_init(void); extern void ip_vs_conn_cleanup(void); @@ -796,12 +947,12 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) * (from ip_vs_app.c) */ #define IP_VS_APP_MAX_PORTS 8 -extern int register_ip_vs_app(struct ip_vs_app *app); -extern void unregister_ip_vs_app(struct ip_vs_app *app); +extern int register_ip_vs_app(struct net *net, struct ip_vs_app *app); +extern void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app); extern int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp); extern void ip_vs_unbind_app(struct ip_vs_conn *cp); -extern int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port); +extern int register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, + __u16 proto, __u16 port); extern int ip_vs_app_inc_get(struct ip_vs_app *inc); extern void ip_vs_app_inc_put(struct ip_vs_app *inc); @@ -814,15 +965,27 @@ void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe); void ip_vs_unbind_pe(struct ip_vs_service *svc); int register_ip_vs_pe(struct ip_vs_pe *pe); int unregister_ip_vs_pe(struct ip_vs_pe *pe); -extern struct ip_vs_pe *ip_vs_pe_get(const char *name); -extern void ip_vs_pe_put(struct ip_vs_pe *pe); +struct ip_vs_pe *ip_vs_pe_getbyname(const char *name); +struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name); + +static inline void ip_vs_pe_get(const struct ip_vs_pe *pe) +{ + if (pe && pe->module) + __module_get(pe->module); +} + +static inline void ip_vs_pe_put(const struct ip_vs_pe *pe) +{ + if (pe && pe->module) + module_put(pe->module); +} /* * IPVS protocol functions (from ip_vs_proto.c) */ extern int ip_vs_protocol_init(void); extern void ip_vs_protocol_cleanup(void); -extern void ip_vs_protocol_timeout_change(int flags); +extern void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags); extern int *ip_vs_create_timeout_table(int *table, int size); extern int ip_vs_set_state_timeout(int *table, int num, const char *const *names, @@ -852,26 +1015,21 @@ extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); extern struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp, int *ignored); + struct ip_vs_proto_data *pd, int *ignored); extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp); + struct ip_vs_proto_data *pd); /* * IPVS control data and functions (from ip_vs_ctl.c) */ -extern int sysctl_ip_vs_cache_bypass; -extern int sysctl_ip_vs_expire_nodest_conn; -extern int sysctl_ip_vs_expire_quiescent_template; -extern int sysctl_ip_vs_sync_threshold[2]; -extern int sysctl_ip_vs_nat_icmp_send; -extern int sysctl_ip_vs_conntrack; -extern int sysctl_ip_vs_snat_reroute; extern struct ip_vs_stats ip_vs_stats; extern const struct ctl_path net_vs_ctl_path[]; +extern int sysctl_ip_vs_sync_ver; +extern void ip_vs_sync_switch_mode(struct net *net, int mode); extern struct ip_vs_service * -ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, +ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport); static inline void ip_vs_service_put(struct ip_vs_service *svc) @@ -880,7 +1038,7 @@ static inline void ip_vs_service_put(struct ip_vs_service *svc) } extern struct ip_vs_dest * -ip_vs_lookup_real_service(int af, __u16 protocol, +ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport); extern int ip_vs_use_count_inc(void); @@ -888,8 +1046,9 @@ extern void ip_vs_use_count_dec(void); extern int ip_vs_control_init(void); extern void ip_vs_control_cleanup(void); extern struct ip_vs_dest * -ip_vs_find_dest(int af, const union nf_inet_addr *daddr, __be16 dport, - const union nf_inet_addr *vaddr, __be16 vport, __u16 protocol); +ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr, + __be16 dport, const union nf_inet_addr *vaddr, __be16 vport, + __u16 protocol, __u32 fwmark); extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); @@ -897,14 +1056,12 @@ extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp); * IPVS sync daemon data and function prototypes * (from ip_vs_sync.c) */ -extern volatile int ip_vs_sync_state; -extern volatile int ip_vs_master_syncid; -extern volatile int ip_vs_backup_syncid; -extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid); -extern int stop_sync_thread(int state); -extern void ip_vs_sync_conn(struct ip_vs_conn *cp); +extern int start_sync_thread(struct net *net, int state, char *mcast_ifn, + __u8 syncid); +extern int stop_sync_thread(struct net *net, int state); +extern void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp); +extern int ip_vs_sync_init(void); +extern void ip_vs_sync_cleanup(void); /* @@ -912,8 +1069,8 @@ extern void ip_vs_sync_conn(struct ip_vs_conn *cp); */ extern int ip_vs_estimator_init(void); extern void ip_vs_estimator_cleanup(void); -extern void ip_vs_new_estimator(struct ip_vs_stats *stats); -extern void ip_vs_kill_estimator(struct ip_vs_stats *stats); +extern void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats); +extern void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats); extern void ip_vs_zero_estimator(struct ip_vs_stats *stats); /* @@ -955,11 +1112,13 @@ extern int ip_vs_icmp_xmit_v6 extern int ip_vs_drop_rate; extern int ip_vs_drop_counter; -static __inline__ int ip_vs_todrop(void) +static inline int ip_vs_todrop(struct netns_ipvs *ipvs) { - if (!ip_vs_drop_rate) return 0; - if (--ip_vs_drop_counter > 0) return 0; - ip_vs_drop_counter = ip_vs_drop_rate; + if (!ipvs->drop_rate) + return 0; + if (--ipvs->drop_counter > 0) + return 0; + ipvs->drop_counter = ipvs->drop_rate; return 1; } @@ -1047,9 +1206,9 @@ static inline void ip_vs_notrack(struct sk_buff *skb) * Netfilter connection tracking * (from ip_vs_nfct.c) */ -static inline int ip_vs_conntrack_enabled(void) +static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { - return sysctl_ip_vs_conntrack; + return ipvs->sysctl_conntrack; } extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, @@ -1062,7 +1221,7 @@ extern void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp); #else -static inline int ip_vs_conntrack_enabled(void) +static inline int ip_vs_conntrack_enabled(struct netns_ipvs *ipvs) { return 0; } diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 1bf812b21fb..b3b4a34cb2c 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -20,6 +20,7 @@ #include <net/netns/conntrack.h> #endif #include <net/netns/xfrm.h> +#include <net/netns/ip_vs.h> struct proc_dir_entry; struct net_device; @@ -94,6 +95,7 @@ struct net { #ifdef CONFIG_XFRM struct netns_xfrm xfrm; #endif + struct netns_ipvs *ipvs; }; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index d85cff10e16..d0d13378991 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -50,11 +50,24 @@ union nf_conntrack_expect_proto { /* per conntrack: application helper private data */ union nf_conntrack_help { /* insert conntrack helper private data (master) here */ +#if defined(CONFIG_NF_CONNTRACK_FTP) || defined(CONFIG_NF_CONNTRACK_FTP_MODULE) struct nf_ct_ftp_master ct_ftp_info; +#endif +#if defined(CONFIG_NF_CONNTRACK_PPTP) || \ + defined(CONFIG_NF_CONNTRACK_PPTP_MODULE) struct nf_ct_pptp_master ct_pptp_info; +#endif +#if defined(CONFIG_NF_CONNTRACK_H323) || \ + defined(CONFIG_NF_CONNTRACK_H323_MODULE) struct nf_ct_h323_master ct_h323_info; +#endif +#if defined(CONFIG_NF_CONNTRACK_SANE) || \ + defined(CONFIG_NF_CONNTRACK_SANE_MODULE) struct nf_ct_sane_master ct_sane_info; +#endif +#if defined(CONFIG_NF_CONNTRACK_SIP) || defined(CONFIG_NF_CONNTRACK_SIP_MODULE) struct nf_ct_sip_master ct_sip_info; +#endif }; #include <linux/types.h> @@ -116,14 +129,14 @@ struct nf_conn { u_int32_t secmark; #endif - /* Storage reserved for other modules: */ - union nf_conntrack_proto proto; - /* Extensions */ struct nf_ct_ext *ext; #ifdef CONFIG_NET_NS struct net *ct_net; #endif + + /* Storage reserved for other modules, must be the last member */ + union nf_conntrack_proto proto; }; static inline struct nf_conn * @@ -189,9 +202,9 @@ extern void nf_ct_l3proto_module_put(unsigned short l3proto); * Allocate a hashtable of hlist_head (if nulls == 0), * or hlist_nulls_head (if nulls == 1) */ -extern void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls); +extern void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls); -extern void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size); +extern void nf_ct_free_hashtable(void *hash, unsigned int size); extern struct nf_conntrack_tuple_hash * __nf_conntrack_find(struct net *net, u16 zone, diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 96ba5f7dcab..8fdb04b8cce 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -23,12 +23,17 @@ struct nf_conntrack_ecache { static inline struct nf_conntrack_ecache * nf_ct_ecache_find(const struct nf_conn *ct) { +#ifdef CONFIG_NF_CONNTRACK_EVENTS return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE); +#else + return NULL; +#endif } static inline struct nf_conntrack_ecache * nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) { +#ifdef CONFIG_NF_CONNTRACK_EVENTS struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *e; @@ -45,6 +50,9 @@ nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp) e->expmask = expmask; } return e; +#else + return NULL; +#endif }; #ifdef CONFIG_NF_CONNTRACK_EVENTS @@ -59,7 +67,7 @@ struct nf_ct_event_notifier { int (*fcn)(unsigned int events, struct nf_ct_event *item); }; -extern struct nf_ct_event_notifier *nf_conntrack_event_cb; +extern struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; extern int nf_conntrack_register_notifier(struct nf_ct_event_notifier *nb); extern void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *nb); @@ -159,7 +167,7 @@ struct nf_exp_event_notifier { int (*fcn)(unsigned int events, struct nf_exp_event *item); }; -extern struct nf_exp_event_notifier *nf_expect_event_cb; +extern struct nf_exp_event_notifier __rcu *nf_expect_event_cb; extern int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *nb); extern void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *nb); diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 0772d296dfd..2dcf31703ac 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -7,10 +7,19 @@ enum nf_ct_ext_id { NF_CT_EXT_HELPER, +#if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE) NF_CT_EXT_NAT, +#endif NF_CT_EXT_ACCT, +#ifdef CONFIG_NF_CONNTRACK_EVENTS NF_CT_EXT_ECACHE, +#endif +#ifdef CONFIG_NF_CONNTRACK_ZONES NF_CT_EXT_ZONE, +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + NF_CT_EXT_TSTAMP, +#endif NF_CT_EXT_NUM, }; @@ -19,6 +28,7 @@ enum nf_ct_ext_id { #define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter #define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache #define NF_CT_EXT_ZONE_TYPE struct nf_conntrack_zone +#define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 32c305dbdab..f1c1311adc2 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -63,4 +63,10 @@ static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct) extern int nf_conntrack_helper_init(void); extern void nf_conntrack_helper_fini(void); +extern int nf_conntrack_broadcast_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int timeout); + #endif /*_NF_CONNTRACK_HELPER_H*/ diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h index a7547611e8f..e8010f445ae 100644 --- a/include/net/netfilter/nf_conntrack_l3proto.h +++ b/include/net/netfilter/nf_conntrack_l3proto.h @@ -73,7 +73,7 @@ struct nf_conntrack_l3proto { struct module *me; }; -extern struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX]; +extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX]; /* Protocol registration. */ extern int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto); diff --git a/include/net/netfilter/nf_conntrack_timestamp.h b/include/net/netfilter/nf_conntrack_timestamp.h new file mode 100644 index 00000000000..fc9c82b1f06 --- /dev/null +++ b/include/net/netfilter/nf_conntrack_timestamp.h @@ -0,0 +1,65 @@ +#ifndef _NF_CONNTRACK_TSTAMP_H +#define _NF_CONNTRACK_TSTAMP_H + +#include <net/net_namespace.h> +#include <linux/netfilter/nf_conntrack_common.h> +#include <linux/netfilter/nf_conntrack_tuple_common.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> + +struct nf_conn_tstamp { + u_int64_t start; + u_int64_t stop; +}; + +static inline +struct nf_conn_tstamp *nf_conn_tstamp_find(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + return nf_ct_ext_find(ct, NF_CT_EXT_TSTAMP); +#else + return NULL; +#endif +} + +static inline +struct nf_conn_tstamp *nf_ct_tstamp_ext_add(struct nf_conn *ct, gfp_t gfp) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + struct net *net = nf_ct_net(ct); + + if (!net->ct.sysctl_tstamp) + return NULL; + + return nf_ct_ext_add(ct, NF_CT_EXT_TSTAMP, gfp); +#else + return NULL; +#endif +}; + +static inline bool nf_ct_tstamp_enabled(struct net *net) +{ + return net->ct.sysctl_tstamp != 0; +} + +static inline void nf_ct_set_tstamp(struct net *net, bool enable) +{ + net->ct.sysctl_tstamp = enable; +} + +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP +extern int nf_conntrack_tstamp_init(struct net *net); +extern void nf_conntrack_tstamp_fini(struct net *net); +#else +static inline int nf_conntrack_tstamp_init(struct net *net) +{ + return 0; +} + +static inline void nf_conntrack_tstamp_fini(struct net *net) +{ + return; +} +#endif /* CONFIG_NF_CONNTRACK_TIMESTAMP */ + +#endif /* _NF_CONNTRACK_TSTAMP_H */ diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index f5f09f032a9..aff80b190c1 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -56,7 +56,9 @@ struct nf_nat_multi_range_compat { /* per conntrack: nat application helper private data */ union nf_conntrack_nat_help { /* insert nat helper private data here */ +#if defined(CONFIG_NF_NAT_PPTP) || defined(CONFIG_NF_NAT_PPTP_MODULE) struct nf_nat_pptp nat_pptp_info; +#endif }; struct nf_conn; @@ -84,7 +86,11 @@ extern int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct) { +#if defined(CONFIG_NF_NAT) || defined(CONFIG_NF_NAT_MODULE) return nf_ct_ext_find(ct, NF_CT_EXT_NAT); +#else + return NULL; +#endif } #else /* !__KERNEL__: iptables wants this to compile. */ diff --git a/include/net/netfilter/nf_nat_core.h b/include/net/netfilter/nf_nat_core.h index 33602ab6619..3dc7b98effe 100644 --- a/include/net/netfilter/nf_nat_core.h +++ b/include/net/netfilter/nf_nat_core.h @@ -21,9 +21,9 @@ static inline int nf_nat_initialized(struct nf_conn *ct, enum nf_nat_manip_type manip) { if (manip == IP_NAT_MANIP_SRC) - return test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); + return ct->status & IPS_SRC_NAT_DONE; else - return test_bit(IPS_DST_NAT_DONE_BIT, &ct->status); + return ct->status & IPS_DST_NAT_DONE; } struct nlattr; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index d4958d4c657..341eb089349 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -21,15 +21,15 @@ struct netns_ct { int sysctl_events; unsigned int sysctl_events_retry_timeout; int sysctl_acct; + int sysctl_tstamp; int sysctl_checksum; unsigned int sysctl_log_invalid; /* Log invalid packets */ #ifdef CONFIG_SYSCTL struct ctl_table_header *sysctl_header; struct ctl_table_header *acct_sysctl_header; + struct ctl_table_header *tstamp_sysctl_header; struct ctl_table_header *event_sysctl_header; #endif - int hash_vmalloc; - int expect_vmalloc; char *slabname; }; #endif diff --git a/include/net/netns/ip_vs.h b/include/net/netns/ip_vs.h new file mode 100644 index 00000000000..259ebac904b --- /dev/null +++ b/include/net/netns/ip_vs.h @@ -0,0 +1,143 @@ +/* + * IP Virtual Server + * Data structure for network namspace + * + */ + +#ifndef IP_VS_H_ +#define IP_VS_H_ + +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/list_nulls.h> +#include <linux/ip_vs.h> +#include <asm/atomic.h> +#include <linux/in.h> + +struct ip_vs_stats; +struct ip_vs_sync_buff; +struct ctl_table_header; + +struct netns_ipvs { + int gen; /* Generation */ + /* + * Hash table: for real service lookups + */ + #define IP_VS_RTAB_BITS 4 + #define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) + #define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) + + struct list_head rs_table[IP_VS_RTAB_SIZE]; + /* ip_vs_app */ + struct list_head app_list; + struct mutex app_mutex; + struct lock_class_key app_key; /* mutex debuging */ + + /* ip_vs_proto */ + #define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ + struct ip_vs_proto_data *proto_data_table[IP_VS_PROTO_TAB_SIZE]; + /* ip_vs_proto_tcp */ +#ifdef CONFIG_IP_VS_PROTO_TCP + #define TCP_APP_TAB_BITS 4 + #define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) + #define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) + struct list_head tcp_apps[TCP_APP_TAB_SIZE]; + spinlock_t tcp_app_lock; +#endif + /* ip_vs_proto_udp */ +#ifdef CONFIG_IP_VS_PROTO_UDP + #define UDP_APP_TAB_BITS 4 + #define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) + #define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) + struct list_head udp_apps[UDP_APP_TAB_SIZE]; + spinlock_t udp_app_lock; +#endif + /* ip_vs_proto_sctp */ +#ifdef CONFIG_IP_VS_PROTO_SCTP + #define SCTP_APP_TAB_BITS 4 + #define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS) + #define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1) + /* Hash table for SCTP application incarnations */ + struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; + spinlock_t sctp_app_lock; +#endif + /* ip_vs_conn */ + atomic_t conn_count; /* connection counter */ + + /* ip_vs_ctl */ + struct ip_vs_stats *tot_stats; /* Statistics & est. */ + struct ip_vs_cpu_stats __percpu *cpustats; /* Stats per cpu */ + seqcount_t *ustats_seq; /* u64 read retry */ + + int num_services; /* no of virtual services */ + /* 1/rate drop and drop-entry variables */ + struct delayed_work defense_work; /* Work handler */ + int drop_rate; + int drop_counter; + atomic_t dropentry; + /* locks in ctl.c */ + spinlock_t dropentry_lock; /* drop entry handling */ + spinlock_t droppacket_lock; /* drop packet handling */ + spinlock_t securetcp_lock; /* state and timeout tables */ + rwlock_t rs_lock; /* real services table */ + /* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ + struct lock_class_key ctl_key; /* ctl_mutex debuging */ + /* Trash for destinations */ + struct list_head dest_trash; + /* Service counters */ + atomic_t ftpsvc_counter; + atomic_t nullsvc_counter; + + /* sys-ctl struct */ + struct ctl_table_header *sysctl_hdr; + struct ctl_table *sysctl_tbl; + /* sysctl variables */ + int sysctl_amemthresh; + int sysctl_am_droprate; + int sysctl_drop_entry; + int sysctl_drop_packet; + int sysctl_secure_tcp; +#ifdef CONFIG_IP_VS_NFCT + int sysctl_conntrack; +#endif + int sysctl_snat_reroute; + int sysctl_sync_ver; + int sysctl_cache_bypass; + int sysctl_expire_nodest_conn; + int sysctl_expire_quiescent_template; + int sysctl_sync_threshold[2]; + int sysctl_nat_icmp_send; + + /* ip_vs_lblc */ + int sysctl_lblc_expiration; + struct ctl_table_header *lblc_ctl_header; + struct ctl_table *lblc_ctl_table; + /* ip_vs_lblcr */ + int sysctl_lblcr_expiration; + struct ctl_table_header *lblcr_ctl_header; + struct ctl_table *lblcr_ctl_table; + /* ip_vs_est */ + struct list_head est_list; /* estimator list */ + spinlock_t est_lock; + struct timer_list est_timer; /* Estimation timer */ + /* ip_vs_sync */ + struct list_head sync_queue; + spinlock_t sync_lock; + struct ip_vs_sync_buff *sync_buff; + spinlock_t sync_buff_lock; + struct sockaddr_in sync_mcast_addr; + struct task_struct *master_thread; + struct task_struct *backup_thread; + int send_mesg_maxlen; + int recv_mesg_maxlen; + volatile int sync_state; + volatile int master_syncid; + volatile int backup_syncid; + /* multicast interface name */ + char master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + /* net name space ptr */ + struct net *net; /* Needed by timer routines */ +}; + +#endif /* IP_VS_H_ */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d68c3f12177..e2e2ef57eca 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -43,7 +43,6 @@ struct netns_ipv4 { struct xt_table *nat_table; struct hlist_head *nat_bysource; unsigned int nat_htable_size; - int nat_vmalloced; #endif int sysctl_icmp_echo_ignore_all; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e9eee99d8b1..16626a04cb0 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -31,10 +31,12 @@ enum qdisc_state_t { * following bits are only changed while qdisc lock is held */ enum qdisc___state_t { - __QDISC___STATE_RUNNING, + __QDISC___STATE_RUNNING = 1, + __QDISC___STATE_THROTTLED = 2, }; struct qdisc_size_table { + struct rcu_head rcu; struct list_head list; struct tc_sizespec szopts; int refcnt; @@ -46,14 +48,13 @@ struct Qdisc { struct sk_buff * (*dequeue)(struct Qdisc *dev); unsigned flags; #define TCQ_F_BUILTIN 1 -#define TCQ_F_THROTTLED 2 -#define TCQ_F_INGRESS 4 -#define TCQ_F_CAN_BYPASS 8 -#define TCQ_F_MQROOT 16 +#define TCQ_F_INGRESS 2 +#define TCQ_F_CAN_BYPASS 4 +#define TCQ_F_MQROOT 8 #define TCQ_F_WARN_NONWC (1 << 16) int padded; struct Qdisc_ops *ops; - struct qdisc_size_table *stab; + struct qdisc_size_table __rcu *stab; struct list_head list; u32 handle; u32 parent; @@ -78,25 +79,43 @@ struct Qdisc { unsigned long state; struct sk_buff_head q; struct gnet_stats_basic_packed bstats; - unsigned long __state; + unsigned int __state; struct gnet_stats_queue qstats; struct rcu_head rcu_head; spinlock_t busylock; }; -static inline bool qdisc_is_running(struct Qdisc *qdisc) +static inline bool qdisc_is_running(const struct Qdisc *qdisc) { - return test_bit(__QDISC___STATE_RUNNING, &qdisc->__state); + return (qdisc->__state & __QDISC___STATE_RUNNING) ? true : false; } static inline bool qdisc_run_begin(struct Qdisc *qdisc) { - return !__test_and_set_bit(__QDISC___STATE_RUNNING, &qdisc->__state); + if (qdisc_is_running(qdisc)) + return false; + qdisc->__state |= __QDISC___STATE_RUNNING; + return true; } static inline void qdisc_run_end(struct Qdisc *qdisc) { - __clear_bit(__QDISC___STATE_RUNNING, &qdisc->__state); + qdisc->__state &= ~__QDISC___STATE_RUNNING; +} + +static inline bool qdisc_is_throttled(const struct Qdisc *qdisc) +{ + return (qdisc->__state & __QDISC___STATE_THROTTLED) ? true : false; +} + +static inline void qdisc_throttled(struct Qdisc *qdisc) +{ + qdisc->__state |= __QDISC___STATE_THROTTLED; +} + +static inline void qdisc_unthrottled(struct Qdisc *qdisc) +{ + qdisc->__state &= ~__QDISC___STATE_THROTTLED; } struct Qdisc_class_ops { @@ -331,8 +350,8 @@ extern struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, struct Qdisc_ops *ops); extern struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, struct Qdisc_ops *ops, u32 parentid); -extern void qdisc_calculate_pkt_len(struct sk_buff *skb, - struct qdisc_size_table *stab); +extern void __qdisc_calculate_pkt_len(struct sk_buff *skb, + const struct qdisc_size_table *stab); extern void tcf_destroy(struct tcf_proto *tp); extern void tcf_destroy_chain(struct tcf_proto **fl); @@ -411,12 +430,20 @@ enum net_xmit_qdisc_t { #define net_xmit_drop_count(e) (1) #endif -static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, + const struct Qdisc *sch) { #ifdef CONFIG_NET_SCHED - if (sch->stab) - qdisc_calculate_pkt_len(skb, sch->stab); + struct qdisc_size_table *stab = rcu_dereference_bh(sch->stab); + + if (stab) + __qdisc_calculate_pkt_len(skb, stab); #endif +} + +static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + qdisc_calculate_pkt_len(skb, sch); return sch->enqueue(skb, sch); } @@ -445,7 +472,6 @@ static inline int __qdisc_enqueue_tail(struct sk_buff *skb, struct Qdisc *sch, { __skb_queue_tail(list, skb); sch->qstats.backlog += qdisc_pkt_len(skb); - qdisc_bstats_update(sch, skb); return NET_XMIT_SUCCESS; } @@ -460,8 +486,10 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct Qdisc *sch, { struct sk_buff *skb = __skb_dequeue(list); - if (likely(skb != NULL)) + if (likely(skb != NULL)) { sch->qstats.backlog -= qdisc_pkt_len(skb); + qdisc_bstats_update(sch, skb); + } return skb; } @@ -474,10 +502,11 @@ static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) static inline unsigned int __qdisc_queue_drop_head(struct Qdisc *sch, struct sk_buff_head *list) { - struct sk_buff *skb = __qdisc_dequeue_head(sch, list); + struct sk_buff *skb = __skb_dequeue(list); if (likely(skb != NULL)) { unsigned int len = qdisc_pkt_len(skb); + sch->qstats.backlog -= len; kfree_skb(skb); return len; } diff --git a/include/net/sock.h b/include/net/sock.h index d884d268c70..ba6465bf7c7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1189,7 +1189,7 @@ extern void sk_filter_release_rcu(struct rcu_head *rcu); static inline void sk_filter_release(struct sk_filter *fp) { if (atomic_dec_and_test(&fp->refcnt)) - call_rcu_bh(&fp->rcu, sk_filter_release_rcu); + call_rcu(&fp->rcu, sk_filter_release_rcu); } static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) diff --git a/kernel/audit.c b/kernel/audit.c index e4956244ae5..162e88e33bc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -74,6 +74,8 @@ static int audit_initialized; int audit_enabled; int audit_ever_enabled; +EXPORT_SYMBOL_GPL(audit_enabled); + /* Default state when kernel boots without any parameters. */ static int audit_default; diff --git a/kernel/params.c b/kernel/params.c index 08107d18175..0da1411222b 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -719,9 +719,7 @@ void destroy_params(const struct kernel_param *params, unsigned num) params[i].ops->free(params[i].arg); } -static void __init kernel_add_sysfs_param(const char *name, - struct kernel_param *kparam, - unsigned int name_skip) +static struct module_kobject * __init locate_module_kobject(const char *name) { struct module_kobject *mk; struct kobject *kobj; @@ -729,10 +727,7 @@ static void __init kernel_add_sysfs_param(const char *name, kobj = kset_find_obj(module_kset, name); if (kobj) { - /* We already have one. Remove params so we can add more. */ mk = to_module_kobject(kobj); - /* We need to remove it before adding parameters. */ - sysfs_remove_group(&mk->kobj, &mk->mp->grp); } else { mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); BUG_ON(!mk); @@ -743,15 +738,36 @@ static void __init kernel_add_sysfs_param(const char *name, "%s", name); if (err) { kobject_put(&mk->kobj); - printk(KERN_ERR "Module '%s' failed add to sysfs, " - "error number %d\n", name, err); - printk(KERN_ERR "The system will be unstable now.\n"); - return; + printk(KERN_ERR + "Module '%s' failed add to sysfs, error number %d\n", + name, err); + printk(KERN_ERR + "The system will be unstable now.\n"); + return NULL; } - /* So that exit path is even. */ + + /* So that we hold reference in both cases. */ kobject_get(&mk->kobj); } + return mk; +} + +static void __init kernel_add_sysfs_param(const char *name, + struct kernel_param *kparam, + unsigned int name_skip) +{ + struct module_kobject *mk; + int err; + + mk = locate_module_kobject(name); + if (!mk) + return; + + /* We need to remove old parameters before adding more. */ + if (mk->mp) + sysfs_remove_group(&mk->kobj, &mk->mp->grp); + /* These should not fail at boot. */ err = add_sysfs_param(mk, kparam, kparam->name + name_skip); BUG_ON(err); @@ -796,6 +812,32 @@ static void __init param_sysfs_builtin(void) } } +ssize_t __modver_version_show(struct module_attribute *mattr, + struct module *mod, char *buf) +{ + struct module_version_attribute *vattr = + container_of(mattr, struct module_version_attribute, mattr); + + return sprintf(buf, "%s\n", vattr->version); +} + +extern struct module_version_attribute __start___modver[], __stop___modver[]; + +static void __init version_sysfs_builtin(void) +{ + const struct module_version_attribute *vattr; + struct module_kobject *mk; + int err; + + for (vattr = __start___modver; vattr < __stop___modver; vattr++) { + mk = locate_module_kobject(vattr->module_name); + if (mk) { + err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); + kobject_uevent(&mk->kobj, KOBJ_ADD); + kobject_put(&mk->kobj); + } + } +} /* module-related sysfs stuff */ @@ -875,6 +917,7 @@ static int __init param_sysfs_init(void) } module_sysfs_initialized = 1; + version_sysfs_builtin(); param_sysfs_builtin(); return 0; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 84522c79698..126a302c481 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2201,13 +2201,6 @@ find_lively_task_by_vpid(pid_t vpid) if (!task) return ERR_PTR(-ESRCH); - /* - * Can't attach events to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - /* Reuse ptrace permission checks for now. */ err = -EACCES; if (!ptrace_may_access(task, PTRACE_MODE_READ)) @@ -2268,14 +2261,27 @@ retry: get_ctx(ctx); - if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { - /* - * We raced with some other task; use - * the context they set. - */ + err = 0; + mutex_lock(&task->perf_event_mutex); + /* + * If it has already passed perf_event_exit_task(). + * we must see PF_EXITING, it takes this mutex too. + */ + if (task->flags & PF_EXITING) + err = -ESRCH; + else if (task->perf_event_ctxp[ctxn]) + err = -EAGAIN; + else + rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx); + mutex_unlock(&task->perf_event_mutex); + + if (unlikely(err)) { put_task_struct(task); kfree(ctx); - goto retry; + + if (err == -EAGAIN) + goto retry; + goto errout; } } @@ -5374,6 +5380,8 @@ free_dev: goto out; } +static struct lock_class_key cpuctx_mutex; + int perf_pmu_register(struct pmu *pmu, char *name, int type) { int cpu, ret; @@ -5422,6 +5430,7 @@ skip_type: cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; cpuctx->jiffies_interval = 1; @@ -6127,7 +6136,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = child->perf_event_ctxp[ctxn]; + child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); task_ctx_sched_out(child_ctx, EVENT_ALL); /* @@ -6440,11 +6449,6 @@ int perf_event_init_context(struct task_struct *child, int ctxn) unsigned long flags; int ret = 0; - child->perf_event_ctxp[ctxn] = NULL; - - mutex_init(&child->perf_event_mutex); - INIT_LIST_HEAD(&child->perf_event_list); - if (likely(!parent->perf_event_ctxp[ctxn])) return 0; @@ -6533,6 +6537,10 @@ int perf_event_init_task(struct task_struct *child) { int ctxn, ret; + memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + for_each_task_context_nr(ctxn) { ret = perf_event_init_context(child, ctxn); if (ret) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 77e9166d7bb..354769979c0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -699,7 +699,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->nr_running--; } -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_FAIR_GROUP_SCHED +# ifdef CONFIG_SMP static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, int global_update) { @@ -762,6 +763,51 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) list_del_leaf_cfs_rq(cfs_rq); } +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, + long weight_delta) +{ + long load_weight, load, shares; + + load = cfs_rq->load.weight + weight_delta; + + load_weight = atomic_read(&tg->load_weight); + load_weight -= cfs_rq->load_contribution; + load_weight += load; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + return shares; +} + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq, 0); + } +} +# else /* CONFIG_SMP */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg, + long weight_delta) +{ + return tg->shares; +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +# endif /* CONFIG_SMP */ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight) { @@ -782,7 +828,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { struct task_group *tg; struct sched_entity *se; - long load_weight, load, shares; + long shares; if (!cfs_rq) return; @@ -791,32 +837,14 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) se = tg->se[cpu_of(rq_of(cfs_rq))]; if (!se) return; - - load = cfs_rq->load.weight + weight_delta; - - load_weight = atomic_read(&tg->load_weight); - load_weight -= cfs_rq->load_contribution; - load_weight += load; - - shares = (tg->shares * load); - if (load_weight) - shares /= load_weight; - - if (shares < MIN_SHARES) - shares = MIN_SHARES; - if (shares > tg->shares) - shares = tg->shares; +#ifndef CONFIG_SMP + if (likely(se->load.weight == tg->shares)) + return; +#endif + shares = calc_cfs_shares(cfs_rq, tg, weight_delta); reweight_entity(cfs_rq_of(se), se, shares); } - -static void update_entity_shares_tick(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { - update_cfs_load(cfs_rq, 0); - update_cfs_shares(cfs_rq, 0); - } -} #else /* CONFIG_FAIR_GROUP_SCHED */ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e216e01bbd..c55ea243347 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -642,8 +642,7 @@ static void tick_nohz_switch_to_nohz(void) } local_irq_enable(); - printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", - smp_processor_id()); + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); } /* @@ -795,8 +794,10 @@ void tick_setup_sched_timer(void) } #ifdef CONFIG_NO_HZ - if (tick_nohz_enabled) + if (tick_nohz_enabled) { ts->nohz_mode = NOHZ_MODE_HIGHRES; + printk(KERN_INFO "Switched to NOHz mode on CPU #%d\n", smp_processor_id()); + } #endif } #endif /* HIGH_RES_TIMERS */ diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 17c5ba7551a..29a54ccd213 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -59,7 +59,6 @@ * safely advertise a maxsize * of 64k */ -#define P9_RDMA_MAX_SGE (P9_RDMA_MAXSIZE >> PAGE_SHIFT) /** * struct p9_trans_rdma - RDMA transport instance * diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c index 50a46afc2bc..2ed0056a39a 100644 --- a/net/bridge/netfilter/ebt_ip6.c +++ b/net/bridge/netfilter/ebt_ip6.c @@ -22,9 +22,15 @@ #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_ip6.h> -struct tcpudphdr { - __be16 src; - __be16 dst; +union pkthdr { + struct { + __be16 src; + __be16 dst; + } tcpudphdr; + struct { + u8 type; + u8 code; + } icmphdr; }; static bool @@ -33,8 +39,8 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct ebt_ip6_info *info = par->matchinfo; const struct ipv6hdr *ih6; struct ipv6hdr _ip6h; - const struct tcpudphdr *pptr; - struct tcpudphdr _ports; + const union pkthdr *pptr; + union pkthdr _pkthdr; ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h); if (ih6 == NULL) @@ -56,26 +62,34 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) return false; if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO)) return false; - if (!(info->bitmask & EBT_IP6_DPORT) && - !(info->bitmask & EBT_IP6_SPORT)) + if (!(info->bitmask & ( EBT_IP6_DPORT | + EBT_IP6_SPORT | EBT_IP6_ICMP6))) return true; - pptr = skb_header_pointer(skb, offset_ph, sizeof(_ports), - &_ports); + + /* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */ + pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr), + &_pkthdr); if (pptr == NULL) return false; if (info->bitmask & EBT_IP6_DPORT) { - u32 dst = ntohs(pptr->dst); + u16 dst = ntohs(pptr->tcpudphdr.dst); if (FWINV(dst < info->dport[0] || dst > info->dport[1], EBT_IP6_DPORT)) return false; } if (info->bitmask & EBT_IP6_SPORT) { - u32 src = ntohs(pptr->src); + u16 src = ntohs(pptr->tcpudphdr.src); if (FWINV(src < info->sport[0] || src > info->sport[1], EBT_IP6_SPORT)) return false; } - return true; + if ((info->bitmask & EBT_IP6_ICMP6) && + FWINV(pptr->icmphdr.type < info->icmpv6_type[0] || + pptr->icmphdr.type > info->icmpv6_type[1] || + pptr->icmphdr.code < info->icmpv6_code[0] || + pptr->icmphdr.code > info->icmpv6_code[1], + EBT_IP6_ICMP6)) + return false; } return true; } @@ -103,6 +117,14 @@ static int ebt_ip6_mt_check(const struct xt_mtchk_param *par) return -EINVAL; if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1]) return -EINVAL; + if (info->bitmask & EBT_IP6_ICMP6) { + if ((info->invflags & EBT_IP6_PROTO) || + info->protocol != IPPROTO_ICMPV6) + return -EINVAL; + if (info->icmpv6_type[0] > info->icmpv6_type[1] || + info->icmpv6_code[0] > info->icmpv6_code[1]) + return -EINVAL; + } return 0; } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 16df0532d4b..5f1825df9dc 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1764,6 +1764,7 @@ static int compat_table_info(const struct ebt_table_info *info, newinfo->entries_size = size; + xt_compat_init_offsets(AF_INET, info->nentries); return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, entries, newinfo); } diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index c665de778b6..f1f98d967d8 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -23,10 +23,8 @@ #include <asm/atomic.h> #define MAX_PHY_LAYERS 7 -#define PHY_NAME_LEN 20 #define container_obj(layr) container_of(layr, struct cfcnfg, layer) -#define RFM_FRAGMENT_SIZE 4030 /* Information about CAIF physical interfaces held by Config Module in order * to manage physical interfaces diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c index d3ed264ad6c..27dab26ad3b 100644 --- a/net/caif/cfdgml.c +++ b/net/caif/cfdgml.c @@ -18,7 +18,6 @@ #define DGM_CMD_BIT 0x80 #define DGM_FLOW_OFF 0x81 #define DGM_FLOW_ON 0x80 -#define DGM_CTRL_PKT_SIZE 1 #define DGM_MTU 1500 static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt); diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c index 9297f7dea9d..8303fe3ebf8 100644 --- a/net/caif/cfserl.c +++ b/net/caif/cfserl.c @@ -25,7 +25,6 @@ struct cfserl { spinlock_t sync; bool usestx; }; -#define STXLEN(layr) (layr->usestx ? 1 : 0) static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt); static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt); diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c index efad410e4c8..315c0d60136 100644 --- a/net/caif/cfutill.c +++ b/net/caif/cfutill.c @@ -20,7 +20,7 @@ #define UTIL_REMOTE_SHUTDOWN 0x82 #define UTIL_FLOW_OFF 0x81 #define UTIL_FLOW_ON 0x80 -#define UTIL_CTRL_PKT_SIZE 1 + static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt); static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt); diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c index 3b425b189a9..c3b1dec4acf 100644 --- a/net/caif/cfveil.c +++ b/net/caif/cfveil.c @@ -17,7 +17,7 @@ #define VEI_FLOW_OFF 0x81 #define VEI_FLOW_ON 0x80 #define VEI_SET_PIN 0x82 -#define VEI_CTRL_PKT_SIZE 1 + #define container_obj(layr) container_of(layr, struct cfsrvl, layer) static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt); diff --git a/net/core/dev.c b/net/core/dev.c index 7c6a46f8037..d162ba8d622 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -749,7 +749,8 @@ EXPORT_SYMBOL(dev_get_by_index); * @ha: hardware address * * Search for an interface by MAC address. Returns NULL if the device - * is not found or a pointer to the device. The caller must hold RCU + * is not found or a pointer to the device. + * The caller must hold RCU or RTNL. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -1285,7 +1286,7 @@ static int __dev_close(struct net_device *dev) return __dev_close_many(&single); } -int dev_close_many(struct list_head *head) +static int dev_close_many(struct list_head *head) { struct net_device *dev, *tmp; LIST_HEAD(tmp_list); @@ -1593,6 +1594,48 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_unlock(); } +/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change + * @dev: Network device + * @txq: number of queues available + * + * If real_num_tx_queues is changed the tc mappings may no longer be + * valid. To resolve this verify the tc mapping remains valid and if + * not NULL the mapping. With no priorities mapping to this + * offset/count pair it will no longer be used. In the worst case TC0 + * is invalid nothing can be done so disable priority mappings. If is + * expected that drivers will fix this mapping if they can before + * calling netif_set_real_num_tx_queues. + */ +static void netif_setup_tc(struct net_device *dev, unsigned int txq) +{ + int i; + struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; + + /* If TC0 is invalidated disable TC mapping */ + if (tc->offset + tc->count > txq) { + pr_warning("Number of in use tx queues changed " + "invalidating tc mappings. Priority " + "traffic classification disabled!\n"); + dev->num_tc = 0; + return; + } + + /* Invalidated prio to tc mappings set to TC0 */ + for (i = 1; i < TC_BITMASK + 1; i++) { + int q = netdev_get_prio_tc_map(dev, i); + + tc = &dev->tc_to_txq[q]; + if (tc->offset + tc->count > txq) { + pr_warning("Number of in use tx queues " + "changed. Priority %i to tc " + "mapping %i is no longer valid " + "setting map to 0\n", + i, q); + netdev_set_prio_tc_map(dev, i, 0); + } + } +} + /* * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. @@ -1612,6 +1655,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) if (rc) return rc; + if (dev->num_tc) + netif_setup_tc(dev, txq); + if (txq < dev->real_num_tx_queues) qdisc_reset_all_tx_gt(dev, txq); } @@ -2161,6 +2207,8 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, unsigned int num_tx_queues) { u32 hash; + u16 qoffset = 0; + u16 qcount = num_tx_queues; if (skb_rx_queue_recorded(skb)) { hash = skb_get_rx_queue(skb); @@ -2169,13 +2217,19 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, return hash; } + if (dev->num_tc) { + u8 tc = netdev_get_prio_tc_map(dev, skb->priority); + qoffset = dev->tc_to_txq[tc].offset; + qcount = dev->tc_to_txq[tc].count; + } + if (skb->sk && skb->sk->sk_hash) hash = skb->sk->sk_hash; else hash = (__force u16) skb->protocol ^ skb->rxhash; hash = jhash_1word(hash, hashrnd); - return (u16) (((u64) hash * num_tx_queues) >> 32); + return (u16) (((u64) hash * qcount) >> 32) + qoffset; } EXPORT_SYMBOL(__skb_tx_hash); @@ -2272,15 +2326,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct netdev_queue *txq) { spinlock_t *root_lock = qdisc_lock(q); - bool contended = qdisc_is_running(q); + bool contended; int rc; + qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_calculate_pkt_len(skb, q); /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. * This permits __QDISC_STATE_RUNNING owner to get the lock more often * and dequeue packets faster. */ + contended = qdisc_is_running(q); if (unlikely(contended)) spin_lock(&q->busylock); @@ -2298,7 +2355,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE)) skb_dst_force(skb); - qdisc_skb_cb(skb)->pkt_len = skb->len; qdisc_bstats_update(q, skb); if (sch_direct_xmit(skb, q, dev, txq, root_lock)) { @@ -2313,7 +2369,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, rc = NET_XMIT_SUCCESS; } else { skb_dst_force(skb); - rc = qdisc_enqueue_root(skb, q); + rc = q->enqueue(skb, q) & NET_XMIT_MASK; if (qdisc_run_begin(q)) { if (unlikely(contended)) { spin_unlock(&q->busylock); @@ -4572,6 +4628,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) EXPORT_SYMBOL(dev_set_mtu); /** + * dev_set_group - Change group this device belongs to + * @dev: device + * @new_group: group this device should belong to + */ +void dev_set_group(struct net_device *dev, int new_group) +{ + dev->group = new_group; +} +EXPORT_SYMBOL(dev_set_group); + +/** * dev_set_mac_address - Change Media Access Control Address * @dev: device * @sa: new address @@ -5678,6 +5745,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_flags = IFF_XMIT_DST_RELEASE; setup(dev); strcpy(dev->name, name); + dev->group = INIT_NETDEV_GROUP; return dev; free_pcpu: diff --git a/net/core/filter.c b/net/core/filter.c index afc58374ca9..232b1873bb2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -142,14 +142,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) if (err) return err; - rcu_read_lock_bh(); - filter = rcu_dereference_bh(sk->sk_filter); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter) { unsigned int pkt_len = sk_run_filter(skb, filter->insns); err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; } - rcu_read_unlock_bh(); + rcu_read_unlock(); return err; } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 60a90291342..799f06e03a2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -316,7 +316,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) { size_t size = entries * sizeof(struct neighbour *); struct neigh_hash_table *ret; - struct neighbour **buckets; + struct neighbour __rcu **buckets; ret = kmalloc(sizeof(*ret), GFP_ATOMIC); if (!ret) @@ -324,14 +324,14 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) if (size <= PAGE_SIZE) buckets = kzalloc(size, GFP_ATOMIC); else - buckets = (struct neighbour **) + buckets = (struct neighbour __rcu **) __get_free_pages(GFP_ATOMIC | __GFP_ZERO, get_order(size)); if (!buckets) { kfree(ret); return NULL; } - rcu_assign_pointer(ret->hash_buckets, buckets); + ret->hash_buckets = buckets; ret->hash_mask = entries - 1; get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); return ret; @@ -343,7 +343,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head) struct neigh_hash_table, rcu); size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *); - struct neighbour **buckets = nht->hash_buckets; + struct neighbour __rcu **buckets = nht->hash_buckets; if (size <= PAGE_SIZE) kfree(buckets); @@ -1540,7 +1540,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) panic("cannot create neighbour proc dir entry"); #endif - tbl->nht = neigh_hash_alloc(8); + RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(8)); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); @@ -1602,7 +1602,8 @@ int neigh_table_clear(struct neigh_table *tbl) } write_unlock(&neigh_tbl_lock); - call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu); + call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu, + neigh_hash_free_rcu); tbl->nht = NULL; kfree(tbl->phash_buckets); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 750db57f3bb..c668f8c371b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -868,6 +868,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, netif_running(dev) ? dev->operstate : IF_OPER_DOWN); NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode); NLA_PUT_U32(skb, IFLA_MTU, dev->mtu); + NLA_PUT_U32(skb, IFLA_GROUP, dev->group); if (dev->ifindex != dev->iflink) NLA_PUT_U32(skb, IFLA_LINK, dev->iflink); @@ -1265,6 +1266,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm, modified = 1; } + if (tb[IFLA_GROUP]) { + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + modified = 1; + } + /* * Interface selected by interface index but interface * name provided implies that a name change has been @@ -1542,6 +1548,8 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net, set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); + if (tb[IFLA_GROUP]) + dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); return dev; @@ -1552,6 +1560,24 @@ err: } EXPORT_SYMBOL(rtnl_create_link); +static int rtnl_group_changelink(struct net *net, int group, + struct ifinfomsg *ifm, + struct nlattr **tb) +{ + struct net_device *dev; + int err; + + for_each_netdev(net, dev) { + if (dev->group == group) { + err = do_setlink(dev, ifm, tb, NULL, 0); + if (err < 0) + return err; + } + } + + return 0; +} + static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { struct net *net = sock_net(skb->sk); @@ -1579,10 +1605,12 @@ replay: ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); - else if (ifname[0]) - dev = __dev_get_by_name(net, ifname); - else - dev = NULL; + else { + if (ifname[0]) + dev = __dev_get_by_name(net, ifname); + else + dev = NULL; + } err = validate_linkmsg(dev, tb); if (err < 0) @@ -1646,8 +1674,13 @@ replay: return do_setlink(dev, ifm, tb, ifname, modified); } - if (!(nlh->nlmsg_flags & NLM_F_CREATE)) + if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) + return rtnl_group_changelink(net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, tb); return -ENODEV; + } if (ifm->ifi_index) return -EOPNOTSUPP; diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c index f2abd375569..b66600b3f4b 100644 --- a/net/decnet/dn_table.c +++ b/net/decnet/dn_table.c @@ -59,7 +59,6 @@ struct dn_hash }; #define dz_key_0(key) ((key).datum = 0) -#define dz_prefix(key,dz) ((key).datum) #define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\ for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 0c877a74e1f..3fb14b7c13c 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -428,7 +428,7 @@ static void __exit dsa_cleanup_module(void) } module_exit(dsa_cleanup_module); -MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>") +MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>"); MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips"); MODULE_LICENSE("GPL"); MODULE_ALIAS("platform:dsa"); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index a5a1050595d..8949a05ac30 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -140,6 +140,9 @@ config IP_ROUTE_VERBOSE handled by the klogd daemon which is responsible for kernel messages ("man klogd"). +config IP_ROUTE_CLASSID + bool + config IP_PNP bool "IP: kernel level autoconfiguration" help @@ -657,4 +660,3 @@ config TCP_MD5SIG on the Internet. If unsure, say N. - diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 04c8b69fd42..7927589813b 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1017,14 +1017,13 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } - if (__in_dev_get_rcu(dev)) { - IN_DEV_CONF_SET(__in_dev_get_rcu(dev), PROXY_ARP, on); + if (__in_dev_get_rtnl(dev)) { + IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on); return 0; } return -ENXIO; } -/* must be called with rcu_read_lock() */ static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { @@ -1233,10 +1232,10 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!(r.arp_flags & ATF_NETMASK)) ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = htonl(0xFFFFFFFFUL); - rcu_read_lock(); + rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - dev = dev_get_by_name_rcu(net, r.arp_dev); + dev = __dev_get_by_name(net, r.arp_dev); if (dev == NULL) goto out; @@ -1263,7 +1262,7 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; } out: - rcu_read_unlock(); + rtnl_unlock(); if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r))) err = -EFAULT; return err; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 7981a24f5c7..9cefe72029c 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -41,12 +41,12 @@ struct fib4_rule { __be32 srcmask; __be32 dst; __be32 dstmask; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID u32 tclassid; #endif }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID u32 fib_rules_tclass(struct fib_result *res) { return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; @@ -165,7 +165,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, if (frh->dst_len) rule4->dst = nla_get_be32(tb[FRA_DST]); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW]) rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); #endif @@ -195,7 +195,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, if (frh->tos && (rule4->tos != frh->tos)) return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) return 0; #endif @@ -224,7 +224,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb, if (rule4->src_len) NLA_PUT_BE32(skb, FRA_SRC, rule4->src); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rule4->tclassid) NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); #endif diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 12d3dc3df1b..9aff11d7278 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -200,7 +200,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_weight != onh->nh_weight || #endif -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) @@ -422,7 +422,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla = nla_find(attrs, attrlen, RTA_GATEWAY); nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; #endif @@ -476,7 +476,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) nla = nla_find(attrs, attrlen, RTA_GATEWAY); if (nla && nla_get_be32(nla) != nh->nh_gw) return 1; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nla = nla_find(attrs, attrlen, RTA_FLOW); if (nla && nla_get_u32(nla) != nh->nh_tclassid) return 1; @@ -779,7 +779,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto err_inval; if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) goto err_inval; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) goto err_inval; #endif @@ -792,7 +792,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid = cfg->fc_flow; #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -1002,7 +1002,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (fi->fib_nh->nh_oif) NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (fi->fib_nh[0].nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); #endif @@ -1027,7 +1027,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, if (nh->nh_gw) NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (nh->nh_tclassid) NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); #endif diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d859bcc26cb..d7b2b0987a3 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb) } } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (unlikely(skb_dst(skb)->tclassid)) { struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); u32 idx = skb_dst(skb)->tclassid; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index babd1a2bae5..f926a310075 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -206,8 +206,9 @@ config IP_NF_TARGET_REDIRECT config NF_NAT_SNMP_BASIC tristate "Basic SNMP-ALG support" - depends on NF_NAT + depends on NF_CONNTRACK_SNMP && NF_NAT depends on NETFILTER_ADVANCED + default NF_NAT && NF_CONNTRACK_SNMP ---help--- This module implements an Application Layer Gateway (ALG) for diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e855fffaed9..e95054c690c 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(NFPROTO_ARP, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1333,6 +1334,7 @@ static int translate_compat_table(const char *name, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(NFPROTO_ARP); + xt_compat_init_offsets(NFPROTO_ARP, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 652efea013d..ef7d7b9680e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1664,6 +1665,7 @@ translate_compat_table(struct net *net, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET); + xt_compat_init_offsets(AF_INET, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 1e26a489765..403ca57f601 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) * that the ->target() function isn't called after ->destroy() */ ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) { - pr_info("no conntrack!\n"); - /* FIXME: need to drop invalid ones, since replies - * to outgoing connections of other nodes will be - * marked as INVALID */ + if (ct == NULL) return NF_DROP; - } /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 72ffc8fda2e..d76d6c9ed94 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf, } #endif - /* MAC logging for input path only. */ - if (in && !out) + if (in != NULL) dump_mac_header(m, loginfo, skb); dump_packet(m, loginfo, skb, 0); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 294a2a32f29..aef5d1fbe77 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, dev_net(out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ - if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { + if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 63f60fc5d26..5585980fce2 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -20,6 +20,7 @@ #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_acct.h> +#include <linux/rculist_nulls.h> struct ct_iter_state { struct seq_net_private p; @@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < net->ct.htable_size; st->bucket++) { - n = rcu_dereference(net->ct.hash[st->bucket].first); + n = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { if (++st->bucket >= net->ct.htable_size) return NULL; } - head = rcu_dereference(net->ct.hash[st->bucket].first); + head = rcu_dereference( + hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); } return head; } @@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(net->ct.expect_hash[st->bucket].first); + n = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); if (n) return n; } @@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(net->ct.expect_hash[st->bucket].first); + head = rcu_dereference( + hlist_first_rcu(&net->ct.expect_hash[st->bucket])); } return head; } diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index 0f23b3f06df..703f366fd23 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; + int res; exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp); - if (ret == 0) + res = nf_ct_expect_related(exp); + if (res == 0) break; - else if (ret != -EBUSY) { + else if (res != -EBUSY) { port = 0; break; } diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index c04787ce1a7..21bcf471b25 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, manips not an issue. */ if (maniptype == IP_NAT_MANIP_SRC && !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { - if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { + /* try the original tuple first */ + if (in_range(orig_tuple, range)) { + if (!nf_nat_used_tuple(orig_tuple, ct)) { + *tuple = *orig_tuple; + return; + } + } else if (find_appropriate_src(net, zone, orig_tuple, tuple, + range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; @@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct, struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; struct nf_conn_nat *nat; - int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK); /* nat helper or nfctnetlink also setup binding */ nat = nfct_nat(ct); @@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct, ct->status |= IPS_DST_NAT; } - /* Place in source hash if this is the first time. */ - if (have_to_hash) { + if (maniptype == IP_NAT_MANIP_SRC) { unsigned int srchash; srchash = hash_by_src(net, nf_ct_zone(ct), @@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct, /* It's done. */ if (maniptype == IP_NAT_MANIP_DST) - set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_DST_NAT_DONE; else - set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); + ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } @@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto) int ret = 0; spin_lock_bh(&nf_nat_lock); - if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { + if (rcu_dereference_protected( + nf_nat_protos[proto->protonum], + lockdep_is_held(&nf_nat_lock) + ) != &nf_nat_unknown_protocol) { ret = -EBUSY; goto out; } @@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) if (nat == NULL || nat->ct == NULL) return; - NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); + NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE); spin_lock_bh(&nf_nat_lock); hlist_del_rcu(&nat->bysource); @@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old) struct nf_conn_nat *old_nat = old; struct nf_conn *ct = old_nat->ct; - if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) + if (!ct || !(ct->status & IPS_SRC_NAT_DONE)) return; spin_lock_bh(&nf_nat_lock); - new_nat->ct = ct; hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); spin_unlock_bh(&nf_nat_lock); } @@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net) { /* Leave them the same for the moment. */ net->ipv4.nat_htable_size = net->ct.htable_size; - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, - &net->ipv4.nat_vmalloced, 0); + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; @@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) { nf_ct_iterate_cleanup(net, &clean_nat, NULL); synchronize_rcu(); - nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, - net->ipv4.nat_htable_size); + nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ee5f419d0a5..8812a02078a 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -54,6 +54,7 @@ #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_nat_helper.h> +#include <linux/netfilter/nf_conntrack_snmp.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); @@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void) { int ret = 0; - ret = nf_conntrack_helper_register(&snmp_helper); - if (ret < 0) - return ret; + BUG_ON(nf_nat_snmp_hook != NULL); + rcu_assign_pointer(nf_nat_snmp_hook, help); + ret = nf_conntrack_helper_register(&snmp_trap_helper); if (ret < 0) { nf_conntrack_helper_unregister(&snmp_helper); @@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void) static void __exit nf_nat_snmp_basic_fini(void) { - nf_conntrack_helper_unregister(&snmp_helper); + rcu_assign_pointer(nf_nat_snmp_hook, NULL); nf_conntrack_helper_unregister(&snmp_trap_helper); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 351dc4e8524..3e5b7cc2db4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -514,7 +514,7 @@ static const struct file_operations rt_cpu_seq_fops = { .release = seq_release, }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; @@ -567,14 +567,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net) if (!pde) goto err2; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif return 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif @@ -588,7 +588,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } @@ -1775,7 +1775,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) memcpy(addr, &src, 4); } -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) @@ -1825,7 +1825,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) rt->rt_gateway = FIB_RES_GW(*res); dst_import_metrics(dst, fi->fib_metrics); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = FIB_RES_NH(*res).nh_tclassid; #endif } @@ -1835,7 +1835,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, fib_rules_tclass(res)); #endif @@ -1891,7 +1891,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->fl.mark = skb->mark; rth->fl.fl4_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_iif = @@ -2208,7 +2208,7 @@ local_input: rth->fl.mark = skb->mark; rth->fl.fl4_src = saddr; rth->rt_src = saddr; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_iif = @@ -2828,7 +2828,7 @@ static int rt_fill_info(struct net *net, } if (rt->dst.dev) NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid) NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); #endif @@ -3249,9 +3249,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = { }; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; -#endif /* CONFIG_NET_CLS_ROUTE */ +#endif /* CONFIG_IP_ROUTE_CLASSID */ static __initdata unsigned long rhash_entries; static int __init set_rhash_entries(char *str) @@ -3267,7 +3267,7 @@ int __init ip_rt_init(void) { int rc = 0; -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 7d227c644f7..47b7b8df7fa 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1076,6 +1076,7 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; + xt_compat_init_offsets(AF_INET6, info->number); xt_entry_foreach(iter, loc_cpu_entry, info->size) { ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); if (ret != 0) @@ -1679,6 +1680,7 @@ translate_compat_table(struct net *net, duprintf("translate_compat_table: size %u\n", info->size); j = 0; xt_compat_lock(AF_INET6); + xt_compat_init_offsets(AF_INET6, number); /* Walk through entries, checking offsets. */ xt_entry_foreach(iter0, entry0, total_size) { ret = check_compat_entry_size_and_hooks(iter0, info, &size, diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 09c88891a75..05027b75372 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -452,8 +452,7 @@ ip6t_log_packet(u_int8_t pf, in ? in->name : "", out ? out->name : ""); - /* MAC logging for input path only. */ - if (in && !out) + if (in != NULL) dump_mac_header(m, loginfo, skb); dump_packet(m, loginfo, skb, skb_network_offset(skb), 1); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 79d43aa8fa8..08572726381 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -45,6 +45,7 @@ #include <linux/netfilter_ipv6.h> #include <linux/kernel.h> #include <linux/module.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> struct nf_ct_frag6_skb_cb @@ -73,7 +74,7 @@ static struct inet_frags nf_frags; static struct netns_frags nf_init_frags; #ifdef CONFIG_SYSCTL -struct ctl_table nf_ct_frag6_sysctl_table[] = { +static struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", .data = &nf_init_frags.timeout, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 86c39526ba5..2bc6cd7bb8e 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -123,18 +123,18 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) } #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) -static int (*mh_filter)(struct sock *sock, struct sk_buff *skb); +typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb); -int rawv6_mh_filter_register(int (*filter)(struct sock *sock, - struct sk_buff *skb)) +static mh_filter_t __rcu *mh_filter __read_mostly; + +int rawv6_mh_filter_register(mh_filter_t filter) { rcu_assign_pointer(mh_filter, filter); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_register); -int rawv6_mh_filter_unregister(int (*filter)(struct sock *sock, - struct sk_buff *skb)) +int rawv6_mh_filter_unregister(mh_filter_t filter) { rcu_assign_pointer(mh_filter, NULL); synchronize_rcu(); @@ -192,10 +192,10 @@ static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) * policy is placed in rawv6_rcv() because it is * required for each socket. */ - int (*filter)(struct sock *sock, struct sk_buff *skb); + mh_filter_t *filter; filter = rcu_dereference(mh_filter); - filtered = filter ? filter(sk, skb) : 0; + filtered = filter ? (*filter)(sk, skb) : 0; break; } #endif diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 8ce38f10a54..b1599a345c1 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -412,7 +412,7 @@ static void prl_list_destroy_rcu(struct rcu_head *head) p = container_of(head, struct ip_tunnel_prl_entry, rcu_head); do { - n = p->next; + n = rcu_dereference_protected(p->next, 1); kfree(p); p = n; } while (p); @@ -421,15 +421,17 @@ static void prl_list_destroy_rcu(struct rcu_head *head) static int ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) { - struct ip_tunnel_prl_entry *x, **p; + struct ip_tunnel_prl_entry *x; + struct ip_tunnel_prl_entry __rcu **p; int err = 0; ASSERT_RTNL(); if (a && a->addr != htonl(INADDR_ANY)) { - for (p = &t->prl; *p; p = &(*p)->next) { - if ((*p)->addr == a->addr) { - x = *p; + for (p = &t->prl; + (x = rtnl_dereference(*p)) != NULL; + p = &x->next) { + if (x->addr == a->addr) { *p = x->next; call_rcu(&x->rcu_head, prl_entry_destroy_rcu); t->prl_count--; @@ -438,9 +440,9 @@ ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) } err = -ENXIO; } else { - if (t->prl) { + x = rtnl_dereference(t->prl); + if (x) { t->prl_count = 0; - x = t->prl; call_rcu(&x->rcu_head, prl_list_destroy_rcu); t->prl = NULL; } @@ -1179,7 +1181,7 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) if (!dev->tstats) return -ENOMEM; dev_hold(dev); - sitn->tunnels_wc[0] = tunnel; + rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); return 0; } @@ -1196,11 +1198,12 @@ static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_hea for (prio = 1; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t = sitn->tunnels[prio][h]; + struct ip_tunnel *t; + t = rtnl_dereference(sitn->tunnels[prio][h]); while (t != NULL) { unregister_netdevice_queue(t->dev, head); - t = t->next; + t = rtnl_dereference(t->next); } } } diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 1534f2b44ca..faf7412ea45 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -85,6 +85,17 @@ config NF_CONNTRACK_EVENTS If unsure, say `N'. +config NF_CONNTRACK_TIMESTAMP + bool 'Connection tracking timestamping' + depends on NETFILTER_ADVANCED + help + This option enables support for connection tracking timestamping. + This allows you to store the flow start-time and to obtain + the flow-stop time (once it has been destroyed) via Connection + tracking events. + + If unsure, say `N'. + config NF_CT_PROTO_DCCP tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' depends on EXPERIMENTAL @@ -185,9 +196,13 @@ config NF_CONNTRACK_IRC To compile it as a module, choose M here. If unsure, say N. +config NF_CONNTRACK_BROADCAST + tristate + config NF_CONNTRACK_NETBIOS_NS tristate "NetBIOS name service protocol support" depends on NETFILTER_ADVANCED + select NF_CONNTRACK_BROADCAST help NetBIOS name service requests are sent as broadcast messages from an unprivileged port and responded to with unicast messages to the @@ -204,6 +219,21 @@ config NF_CONNTRACK_NETBIOS_NS To compile it as a module, choose M here. If unsure, say N. +config NF_CONNTRACK_SNMP + tristate "SNMP service protocol support" + depends on NETFILTER_ADVANCED + select NF_CONNTRACK_BROADCAST + help + SNMP service requests are sent as broadcast messages from an + unprivileged port and responded to with unicast messages to the + same port. This make them hard to firewall properly because connection + tracking doesn't deal with broadcasts. This helper tracks locally + originating SNMP service requests and the corresponding + responses. It relies on correct IP address configuration, specifically + netmask and broadcast address. + + To compile it as a module, choose M here. If unsure, say N. + config NF_CONNTRACK_PPTP tristate "PPtP protocol support" depends on NETFILTER_ADVANCED @@ -326,6 +356,16 @@ config NETFILTER_XT_CONNMARK comment "Xtables targets" +config NETFILTER_XT_TARGET_AUDIT + tristate "AUDIT target support" + depends on AUDIT + depends on NETFILTER_ADVANCED + ---help--- + This option adds a 'AUDIT' target, which can be used to create + audit records for packets dropped/accepted. + + To compileit as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_CHECKSUM tristate "CHECKSUM target support" depends on IP_NF_MANGLE || IP6_NF_MANGLE @@ -477,6 +517,7 @@ config NETFILTER_XT_TARGET_NFLOG config NETFILTER_XT_TARGET_NFQUEUE tristate '"NFQUEUE" target Support' depends on NETFILTER_ADVANCED + select NETFILTER_NETLINK_QUEUE help This target replaced the old obsolete QUEUE target. @@ -886,7 +927,7 @@ config NETFILTER_XT_MATCH_RATEEST config NETFILTER_XT_MATCH_REALM tristate '"realm" match support' depends on NETFILTER_ADVANCED - select NET_CLS_ROUTE + select IP_ROUTE_CLASSID help This option adds a `realm' match, which allows you to use the realm key from the routing subsystem inside iptables. diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 441050f3111..9ae6878a85b 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -1,6 +1,7 @@ netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o +nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o obj-$(CONFIG_NETFILTER) = netfilter.o @@ -28,7 +29,9 @@ obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o +obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o +obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o @@ -45,6 +48,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o # targets +obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 32fcbe290c0..1e00bf7d27c 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -175,13 +175,21 @@ next_hook: ret = 1; } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { kfree_skb(skb); - ret = -(verdict >> NF_VERDICT_BITS); + ret = NF_DROP_GETERR(verdict); if (ret == 0) ret = -EPERM; } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { - if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn, - verdict >> NF_VERDICT_BITS)) - goto next_hook; + ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn, + verdict >> NF_VERDICT_QBITS); + if (ret < 0) { + if (ret == -ECANCELED) + goto next_hook; + if (ret == -ESRCH && + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) + goto next_hook; + kfree_skb(skb); + } + ret = 0; } rcu_read_unlock(); return ret; @@ -214,7 +222,7 @@ EXPORT_SYMBOL(skb_make_writable); /* This does not belong here, but locally generated errors need it if connection tracking in use: without this, connection may not be in hash table, and hence manufactured ICMP or RST packets will not be associated with it. */ -void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); +void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly; EXPORT_SYMBOL(ip_ct_attach); void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) @@ -231,7 +239,7 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) } EXPORT_SYMBOL(nf_ct_attach); -void (*nf_ct_destroy)(struct nf_conntrack *); +void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly; EXPORT_SYMBOL(nf_ct_destroy); void nf_conntrack_destroy(struct nf_conntrack *nfct) diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index a475edee091..5c48ffb60c2 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -43,11 +43,6 @@ EXPORT_SYMBOL(register_ip_vs_app); EXPORT_SYMBOL(unregister_ip_vs_app); EXPORT_SYMBOL(register_ip_vs_app_inc); -/* ipvs application list head */ -static LIST_HEAD(ip_vs_app_list); -static DEFINE_MUTEX(__ip_vs_app_mutex); - - /* * Get an ip_vs_app object */ @@ -67,7 +62,8 @@ static inline void ip_vs_app_put(struct ip_vs_app *app) * Allocate/initialize app incarnation and register it in proto apps. */ static int -ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) +ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) { struct ip_vs_protocol *pp; struct ip_vs_app *inc; @@ -98,7 +94,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) } } - ret = pp->register_app(inc); + ret = pp->register_app(net, inc); if (ret) goto out; @@ -119,7 +115,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) * Release app incarnation */ static void -ip_vs_app_inc_release(struct ip_vs_app *inc) +ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc) { struct ip_vs_protocol *pp; @@ -127,7 +123,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc) return; if (pp->unregister_app) - pp->unregister_app(inc); + pp->unregister_app(net, inc); IP_VS_DBG(9, "%s App %s:%u unregistered\n", pp->name, inc->name, ntohs(inc->port)); @@ -168,15 +164,17 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc) * Register an application incarnation in protocol applications */ int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) +register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto, + __u16 port) { + struct netns_ipvs *ipvs = net_ipvs(net); int result; - mutex_lock(&__ip_vs_app_mutex); + mutex_lock(&ipvs->app_mutex); - result = ip_vs_app_inc_new(app, proto, port); + result = ip_vs_app_inc_new(net, app, proto, port); - mutex_unlock(&__ip_vs_app_mutex); + mutex_unlock(&ipvs->app_mutex); return result; } @@ -185,16 +183,17 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) /* * ip_vs_app registration routine */ -int register_ip_vs_app(struct ip_vs_app *app) +int register_ip_vs_app(struct net *net, struct ip_vs_app *app) { + struct netns_ipvs *ipvs = net_ipvs(net); /* increase the module use count */ ip_vs_use_count_inc(); - mutex_lock(&__ip_vs_app_mutex); + mutex_lock(&ipvs->app_mutex); - list_add(&app->a_list, &ip_vs_app_list); + list_add(&app->a_list, &ipvs->app_list); - mutex_unlock(&__ip_vs_app_mutex); + mutex_unlock(&ipvs->app_mutex); return 0; } @@ -204,19 +203,20 @@ int register_ip_vs_app(struct ip_vs_app *app) * ip_vs_app unregistration routine * We are sure there are no app incarnations attached to services */ -void unregister_ip_vs_app(struct ip_vs_app *app) +void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_app *inc, *nxt; - mutex_lock(&__ip_vs_app_mutex); + mutex_lock(&ipvs->app_mutex); list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { - ip_vs_app_inc_release(inc); + ip_vs_app_inc_release(net, inc); } list_del(&app->a_list); - mutex_unlock(&__ip_vs_app_mutex); + mutex_unlock(&ipvs->app_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -226,7 +226,8 @@ void unregister_ip_vs_app(struct ip_vs_app *app) /* * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) */ -int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) +int ip_vs_bind_app(struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) { return pp->app_conn_bind(cp); } @@ -481,11 +482,11 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) * /proc/net/ip_vs_app entry function */ -static struct ip_vs_app *ip_vs_app_idx(loff_t pos) +static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos) { struct ip_vs_app *app, *inc; - list_for_each_entry(app, &ip_vs_app_list, a_list) { + list_for_each_entry(app, &ipvs->app_list, a_list) { list_for_each_entry(inc, &app->incs_list, a_list) { if (pos-- == 0) return inc; @@ -497,19 +498,24 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos) static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) { - mutex_lock(&__ip_vs_app_mutex); + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); - return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; + mutex_lock(&ipvs->app_mutex); + + return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN; } static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_vs_app *inc, *app; struct list_head *e; + struct net *net = seq_file_net(seq); + struct netns_ipvs *ipvs = net_ipvs(net); ++*pos; if (v == SEQ_START_TOKEN) - return ip_vs_app_idx(0); + return ip_vs_app_idx(ipvs, 0); inc = v; app = inc->app; @@ -518,7 +524,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) return list_entry(e, struct ip_vs_app, a_list); /* go on to next application */ - for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { + for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) { app = list_entry(e, struct ip_vs_app, a_list); list_for_each_entry(inc, &app->incs_list, a_list) { return inc; @@ -529,7 +535,9 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) { - mutex_unlock(&__ip_vs_app_mutex); + struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq)); + + mutex_unlock(&ipvs->app_mutex); } static int ip_vs_app_seq_show(struct seq_file *seq, void *v) @@ -557,7 +565,8 @@ static const struct seq_operations ip_vs_app_seq_ops = { static int ip_vs_app_open(struct inode *inode, struct file *file) { - return seq_open(file, &ip_vs_app_seq_ops); + return seq_open_net(inode, file, &ip_vs_app_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations ip_vs_app_fops = { @@ -569,15 +578,36 @@ static const struct file_operations ip_vs_app_fops = { }; #endif -int __init ip_vs_app_init(void) +static int __net_init __ip_vs_app_init(struct net *net) { - /* we will replace it with proc_net_ipvs_create() soon */ - proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); + struct netns_ipvs *ipvs = net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->app_list); + __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key); + proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops); return 0; } +static void __net_exit __ip_vs_app_cleanup(struct net *net) +{ + proc_net_remove(net, "ip_vs_app"); +} + +static struct pernet_operations ip_vs_app_ops = { + .init = __ip_vs_app_init, + .exit = __ip_vs_app_cleanup, +}; + +int __init ip_vs_app_init(void) +{ + int rv; + + rv = register_pernet_subsys(&ip_vs_app_ops); + return rv; +} + void ip_vs_app_cleanup(void) { - proc_net_remove(&init_net, "ip_vs_app"); + unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index e9adecdc8ca..83233fe24a0 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -48,35 +48,32 @@ /* * Connection hash size. Default is what was selected at compile time. */ -int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; +static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); /* size and mask values */ -int ip_vs_conn_tab_size; -int ip_vs_conn_tab_mask; +int ip_vs_conn_tab_size __read_mostly; +static int ip_vs_conn_tab_mask __read_mostly; /* * Connection hash table: for input and output packets lookups of IPVS */ -static struct list_head *ip_vs_conn_tab; +static struct list_head *ip_vs_conn_tab __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; -/* counter for current IPVS connections */ -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); - /* counter for no client port connections */ static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); /* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd; +static unsigned int ip_vs_conn_rnd __read_mostly; /* * Fine locking granularity for big connection hash table */ -#define CT_LOCKARRAY_BITS 4 +#define CT_LOCKARRAY_BITS 5 #define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) #define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) @@ -133,19 +130,19 @@ static inline void ct_write_unlock_bh(unsigned key) /* * Returns hash value for IPVS connection entry */ -static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, +static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto, const union nf_inet_addr *addr, __be16 port) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) - return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), - (__force u32)port, proto, ip_vs_conn_rnd) - & ip_vs_conn_tab_mask; + return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), + (__force u32)port, proto, ip_vs_conn_rnd) ^ + ((size_t)net>>8)) & ip_vs_conn_tab_mask; #endif - return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, - ip_vs_conn_rnd) - & ip_vs_conn_tab_mask; + return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto, + ip_vs_conn_rnd) ^ + ((size_t)net>>8)) & ip_vs_conn_tab_mask; } static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, @@ -166,18 +163,18 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, port = p->vport; } - return ip_vs_conn_hashkey(p->af, p->protocol, addr, port); + return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port); } static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport, - NULL, 0, &p); + ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol, + &cp->caddr, cp->cport, NULL, 0, &p); - if (cp->dest && cp->dest->svc->pe) { - p.pe = cp->dest->svc->pe; + if (cp->pe) { + p.pe = cp->pe; p.pe_data = cp->pe_data; p.pe_data_len = cp->pe_data_len; } @@ -186,7 +183,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) } /* - * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. + * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port. * returns bool success. */ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) @@ -269,11 +266,12 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->af == p->af && + p->cport == cp->cport && p->vport == cp->vport && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && - p->cport == cp->cport && p->vport == cp->vport && ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - p->protocol == cp->protocol) { + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { /* HIT */ atomic_inc(&cp->refcnt); ct_read_unlock(hash); @@ -313,23 +311,23 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, struct ip_vs_conn_param *p) { __be16 _ports[2], *pptr; + struct net *net = skb_net(skb); pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); if (pptr == NULL) return 1; if (likely(!inverse)) - ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0], - &iph->daddr, pptr[1], p); + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr, + pptr[0], &iph->daddr, pptr[1], p); else - ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1], - &iph->saddr, pptr[0], p); + ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr, + pptr[1], &iph->saddr, pptr[0], p); return 0; } struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { @@ -353,8 +351,10 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (!ip_vs_conn_net_eq(cp, p->net)) + continue; if (p->pe_data && p->pe->ct_match) { - if (p->pe->ct_match(p, cp)) + if (p->pe == cp->pe && p->pe->ct_match(p, cp)) goto out; continue; } @@ -404,10 +404,11 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->af == p->af && + p->vport == cp->cport && p->cport == cp->dport && ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && - p->vport == cp->cport && p->cport == cp->dport && - p->protocol == cp->protocol) { + p->protocol == cp->protocol && + ip_vs_conn_net_eq(cp, p->net)) { /* HIT */ atomic_inc(&cp->refcnt); ret = cp; @@ -428,7 +429,6 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) struct ip_vs_conn * ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { @@ -611,9 +611,9 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) struct ip_vs_dest *dest; if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, - &cp->vaddr, cp->vport, - cp->protocol); + dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr, + cp->dport, &cp->vaddr, cp->vport, + cp->protocol, cp->fwmark); ip_vs_bind_dest(cp, dest); return dest; } else @@ -686,13 +686,14 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) int ip_vs_check_template(struct ip_vs_conn *ct) { struct ip_vs_dest *dest = ct->dest; + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct)); /* * Checking the dest server status. */ if ((dest == NULL) || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - (sysctl_ip_vs_expire_quiescent_template && + (ipvs->sysctl_expire_quiescent_template && (atomic_read(&dest->weight) == 0))) { IP_VS_DBG_BUF(9, "check_template: dest not available for " "protocol %s s:%s:%d v:%s:%d " @@ -730,6 +731,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct) static void ip_vs_conn_expire(unsigned long data) { struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); cp->timeout = 60*HZ; @@ -765,13 +767,14 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->flags & IP_VS_CONN_F_NFCT) ip_vs_conn_drop_conntrack(cp); + ip_vs_pe_put(cp->pe); kfree(cp->pe_data); if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); - atomic_dec(&ip_vs_conn_count); + atomic_dec(&ipvs->conn_count); kmem_cache_free(ip_vs_conn_cachep, cp); return; @@ -802,10 +805,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) struct ip_vs_conn * ip_vs_conn_new(const struct ip_vs_conn_param *p, const union nf_inet_addr *daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest) + struct ip_vs_dest *dest, __u32 fwmark) { struct ip_vs_conn *cp; - struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol); + struct netns_ipvs *ipvs = net_ipvs(p->net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net, + p->protocol); cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { @@ -815,6 +820,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, INIT_LIST_HEAD(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + ip_vs_conn_net_set(cp, p->net); cp->af = p->af; cp->protocol = p->protocol; ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); @@ -826,7 +832,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; - if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) { + cp->fwmark = fwmark; + if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) { + ip_vs_pe_get(p->pe); + cp->pe = p->pe; cp->pe_data = p->pe_data; cp->pe_data_len = p->pe_data_len; } @@ -842,7 +851,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, atomic_set(&cp->n_control, 0); atomic_set(&cp->in_pkts, 0); - atomic_inc(&ip_vs_conn_count); + atomic_inc(&ipvs->conn_count); if (flags & IP_VS_CONN_F_NO_CPORT) atomic_inc(&ip_vs_conn_no_cport_cnt); @@ -861,8 +870,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, #endif ip_vs_bind_xmit(cp); - if (unlikely(pp && atomic_read(&pp->appcnt))) - ip_vs_bind_app(cp, pp); + if (unlikely(pd && atomic_read(&pd->appcnt))) + ip_vs_bind_app(cp, pd->pp); /* * Allow conntrack to be preserved. By default, conntrack @@ -871,7 +880,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, * IP_VS_CONN_F_ONE_PACKET too. */ - if (ip_vs_conntrack_enabled()) + if (ip_vs_conntrack_enabled(ipvs)) cp->flags |= IP_VS_CONN_F_NFCT; /* Hash it in the ip_vs_conn_tab finally */ @@ -884,17 +893,22 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, * /proc/net/ip_vs_conn entries */ #ifdef CONFIG_PROC_FS +struct ip_vs_iter_state { + struct seq_net_private p; + struct list_head *l; +}; static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) { int idx; struct ip_vs_conn *cp; + struct ip_vs_iter_state *iter = seq->private; for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { ct_read_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { if (pos-- == 0) { - seq->private = &ip_vs_conn_tab[idx]; + iter->l = &ip_vs_conn_tab[idx]; return cp; } } @@ -906,14 +920,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) { - seq->private = NULL; + struct ip_vs_iter_state *iter = seq->private; + + iter->l = NULL; return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; } static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_vs_conn *cp = v; - struct list_head *e, *l = seq->private; + struct ip_vs_iter_state *iter = seq->private; + struct list_head *e, *l = iter->l; int idx; ++*pos; @@ -930,18 +947,19 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) while (++idx < ip_vs_conn_tab_size) { ct_read_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - seq->private = &ip_vs_conn_tab[idx]; + iter->l = &ip_vs_conn_tab[idx]; return cp; } ct_read_unlock_bh(idx); } - seq->private = NULL; + iter->l = NULL; return NULL; } static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) { - struct list_head *l = seq->private; + struct ip_vs_iter_state *iter = seq->private; + struct list_head *l = iter->l; if (l) ct_read_unlock_bh(l - ip_vs_conn_tab); @@ -955,18 +973,19 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; - if (cp->dest && cp->pe_data && - cp->dest->svc->pe->show_pe_data) { + if (!ip_vs_conn_net_eq(cp, net)) + return 0; + if (cp->pe_data) { pe_data[0] = ' '; - len = strlen(cp->dest->svc->pe->name); - memcpy(pe_data + 1, cp->dest->svc->pe->name, len); + len = strlen(cp->pe->name); + memcpy(pe_data + 1, cp->pe->name, len); pe_data[len + 1] = ' '; len += 2; - len += cp->dest->svc->pe->show_pe_data(cp, - pe_data + len); + len += cp->pe->show_pe_data(cp, pe_data + len); } pe_data[len] = '\0'; @@ -1004,7 +1023,8 @@ static const struct seq_operations ip_vs_conn_seq_ops = { static int ip_vs_conn_open(struct inode *inode, struct file *file) { - return seq_open(file, &ip_vs_conn_seq_ops); + return seq_open_net(inode, file, &ip_vs_conn_seq_ops, + sizeof(struct ip_vs_iter_state)); } static const struct file_operations ip_vs_conn_fops = { @@ -1031,6 +1051,10 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); else { const struct ip_vs_conn *cp = v; + struct net *net = seq_file_net(seq); + + if (!ip_vs_conn_net_eq(cp, net)) + return 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -1067,7 +1091,8 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = { static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) { - return seq_open(file, &ip_vs_conn_sync_seq_ops); + return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops, + sizeof(struct ip_vs_iter_state)); } static const struct file_operations ip_vs_conn_sync_fops = { @@ -1113,7 +1138,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp) } /* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(void) +void ip_vs_random_dropentry(struct net *net) { int idx; struct ip_vs_conn *cp; @@ -1133,7 +1158,8 @@ void ip_vs_random_dropentry(void) if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; - + if (!ip_vs_conn_net_eq(cp, net)) + continue; if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { case IP_VS_TCP_S_SYN_RECV: @@ -1168,12 +1194,13 @@ void ip_vs_random_dropentry(void) /* * Flush all the connection entries in the ip_vs_conn_tab */ -static void ip_vs_conn_flush(void) +static void ip_vs_conn_flush(struct net *net) { int idx; struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(net); - flush_again: +flush_again: for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { /* * Lock is actually needed in this loop. @@ -1181,7 +1208,8 @@ static void ip_vs_conn_flush(void) ct_write_lock_bh(idx); list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - + if (!ip_vs_conn_net_eq(cp, net)) + continue; IP_VS_DBG(4, "del connection\n"); ip_vs_conn_expire_now(cp); if (cp->control) { @@ -1194,16 +1222,41 @@ static void ip_vs_conn_flush(void) /* the counter may be not NULL, because maybe some conn entries are run by slow timer handler or unhashed but still referred */ - if (atomic_read(&ip_vs_conn_count) != 0) { + if (atomic_read(&ipvs->conn_count) != 0) { schedule(); goto flush_again; } } +/* + * per netns init and exit + */ +int __net_init __ip_vs_conn_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + atomic_set(&ipvs->conn_count, 0); + + proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + return 0; +} +static void __net_exit __ip_vs_conn_cleanup(struct net *net) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(net); + proc_net_remove(net, "ip_vs_conn"); + proc_net_remove(net, "ip_vs_conn_sync"); +} +static struct pernet_operations ipvs_conn_ops = { + .init = __ip_vs_conn_init, + .exit = __ip_vs_conn_cleanup, +}; int __init ip_vs_conn_init(void) { int idx; + int retc; /* Compute size and mask */ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; @@ -1241,24 +1294,18 @@ int __init ip_vs_conn_init(void) rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); } - proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + retc = register_pernet_subsys(&ipvs_conn_ops); /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - return 0; + return retc; } - void ip_vs_conn_cleanup(void) { - /* flush all the connection entries first */ - ip_vs_conn_flush(); - + unregister_pernet_subsys(&ipvs_conn_ops); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove(&init_net, "ip_vs_conn"); - proc_net_remove(&init_net, "ip_vs_conn_sync"); vfree(ip_vs_conn_tab); } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b4e51e9c5a0..f36a84f33ef 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -41,6 +41,7 @@ #include <net/icmp.h> /* for icmp_send */ #include <net/route.h> #include <net/ip6_checksum.h> +#include <net/netns/generic.h> /* net_generic() */ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> @@ -68,6 +69,12 @@ EXPORT_SYMBOL(ip_vs_conn_put); EXPORT_SYMBOL(ip_vs_get_debug_level); #endif +int ip_vs_net_id __read_mostly; +#ifdef IP_VS_GENERIC_NETNS +EXPORT_SYMBOL(ip_vs_net_id); +#endif +/* netns cnt used for uniqueness */ +static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0); /* ID used in ICMP lookups */ #define icmp_id(icmph) (((icmph)->un).echo.id) @@ -108,21 +115,28 @@ static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.inpkts++; - dest->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.inpkts++; - dest->svc->stats.ustats.inbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.inpkts++; - ip_vs_stats.ustats.inbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); + struct ip_vs_cpu_stats *s; + + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(dest->svc->stats.cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->cpustats); + s->ustats.inpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.inbytes += skb->len; + u64_stats_update_end(&s->syncp); } } @@ -131,21 +145,28 @@ static inline void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) { struct ip_vs_dest *dest = cp->dest; + struct netns_ipvs *ipvs = net_ipvs(skb_net(skb)); + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.ustats.outpkts++; - dest->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.ustats.outpkts++; - dest->svc->stats.ustats.outbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.outpkts++; - ip_vs_stats.ustats.outbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); + struct ip_vs_cpu_stats *s; + + s = this_cpu_ptr(dest->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(dest->svc->stats.cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); + + s = this_cpu_ptr(ipvs->cpustats); + s->ustats.outpkts++; + u64_stats_update_begin(&s->syncp); + s->ustats.outbytes += skb->len; + u64_stats_update_end(&s->syncp); } } @@ -153,41 +174,44 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) { - spin_lock(&cp->dest->stats.lock); - cp->dest->stats.ustats.conns++; - spin_unlock(&cp->dest->stats.lock); + struct netns_ipvs *ipvs = net_ipvs(svc->net); + struct ip_vs_cpu_stats *s; + + s = this_cpu_ptr(cp->dest->stats.cpustats); + s->ustats.conns++; - spin_lock(&svc->stats.lock); - svc->stats.ustats.conns++; - spin_unlock(&svc->stats.lock); + s = this_cpu_ptr(svc->stats.cpustats); + s->ustats.conns++; - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.ustats.conns++; - spin_unlock(&ip_vs_stats.lock); + s = this_cpu_ptr(ipvs->cpustats); + s->ustats.conns++; } static inline int ip_vs_set_state(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { - if (unlikely(!pp->state_transition)) + if (unlikely(!pd->pp->state_transition)) return 0; - return pp->state_transition(cp, direction, skb, pp); + return pd->pp->state_transition(cp, direction, skb, pd); } -static inline void +static inline int ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, struct sk_buff *skb, int protocol, const union nf_inet_addr *caddr, __be16 cport, const union nf_inet_addr *vaddr, __be16 vport, struct ip_vs_conn_param *p) { - ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p); + ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr, + vport, p); p->pe = svc->pe; if (p->pe && p->pe->fill_param) - p->pe->fill_param(p, skb); + return p->pe->fill_param(p, skb); + + return 0; } /* @@ -200,7 +224,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, struct sk_buff *skb, - __be16 ports[2]) + __be16 src_port, __be16 dst_port, int *ignored) { struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; @@ -224,8 +248,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " "mnet %s\n", - IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), - IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), + IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port), + IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port), IP_VS_DBG_ADDR(svc->af, &snet)); /* @@ -247,14 +271,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; __be16 vport = 0; - if (ports[1] == svc->port) { + if (dst_port == svc->port) { /* non-FTP template: * <protocol, caddr, 0, vaddr, vport, daddr, dport> * FTP template: * <protocol, caddr, 0, vaddr, 0, daddr, 0> */ if (svc->port != FTPPORT) - vport = ports[1]; + vport = dst_port; } else { /* Note: persistent fwmark-based services and * persistent port zero service are handled here. @@ -268,24 +292,31 @@ ip_vs_sched_persist(struct ip_vs_service *svc, vaddr = &fwmark; } } - ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, - vaddr, vport, ¶m); + /* return *ignored = -1 so NF_DROP can be used */ + if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, + vaddr, vport, ¶m) < 0) { + *ignored = -1; + return NULL; + } } /* Check if a template already exists */ ct = ip_vs_ct_in_get(¶m); if (!ct || !ip_vs_check_template(ct)) { - /* No template found or the dest of the connection + /* + * No template found or the dest of the connection * template is not available. + * return *ignored=0 i.e. ICMP and NF_DROP */ dest = svc->scheduler->schedule(svc, skb); if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); + *ignored = 0; return NULL; } - if (ports[1] == svc->port && svc->port != FTPPORT) + if (dst_port == svc->port && svc->port != FTPPORT) dport = dest->port; /* Create a template @@ -293,9 +324,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * and thus param.pe_data will be destroyed * when the template expires */ ct = ip_vs_conn_new(¶m, &dest->addr, dport, - IP_VS_CONN_F_TEMPLATE, dest); + IP_VS_CONN_F_TEMPLATE, dest, skb->mark); if (ct == NULL) { kfree(param.pe_data); + *ignored = -1; return NULL; } @@ -306,7 +338,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, kfree(param.pe_data); } - dport = ports[1]; + dport = dst_port; if (dport == svc->port && dest->port) dport = dest->port; @@ -317,11 +349,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0], - &iph.daddr, ports[1], ¶m); - cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest); + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr, + src_port, &iph.daddr, dst_port, ¶m); + + cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark); if (cp == NULL) { ip_vs_conn_put(ct); + *ignored = -1; return NULL; } @@ -341,11 +375,27 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * It selects a server according to the virtual service, and * creates a connection entry. * Protocols supported: TCP, UDP + * + * Usage of *ignored + * + * 1 : protocol tried to schedule (eg. on SYN), found svc but the + * svc/scheduler decides that this packet should be accepted with + * NF_ACCEPT because it must not be scheduled. + * + * 0 : scheduler can not find destination, so try bypass or + * return ICMP and then NF_DROP (ip_vs_leave). + * + * -1 : scheduler tried to schedule but fatal error occurred, eg. + * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param + * failure such as missing Call-ID, ENOMEM on skb_linearize + * or pe_data. In this case we should return NF_DROP without + * any attempts to send ICMP with ip_vs_leave. */ struct ip_vs_conn * ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp, int *ignored) + struct ip_vs_proto_data *pd, int *ignored) { + struct ip_vs_protocol *pp = pd->pp; struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; struct ip_vs_dest *dest; @@ -371,12 +421,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, } /* - * Do not schedule replies from local real server. It is risky - * for fwmark services but mostly for persistent services. + * Do not schedule replies from local real server. */ if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && - (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && - (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { + (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) { IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, "Not scheduling reply for existing connection"); __ip_vs_conn_put(cp); @@ -386,10 +434,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, /* * Persistent service */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) { - *ignored = 0; - return ip_vs_sched_persist(svc, skb, pptr); - } + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored); + + *ignored = 0; /* * Non-persistent service @@ -402,8 +450,6 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, return NULL; } - *ignored = 0; - dest = svc->scheduler->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); @@ -419,13 +465,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, */ { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, - pptr[0], &iph.daddr, pptr[1], &p); + + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, + &iph.saddr, pptr[0], &iph.daddr, pptr[1], + &p); cp = ip_vs_conn_new(&p, &dest->addr, dest->port ? dest->port : pptr[1], - flags, dest); - if (!cp) + flags, dest, skb->mark); + if (!cp) { + *ignored = -1; return NULL; + } } IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " @@ -447,11 +497,14 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * no destination is available for a new connection. */ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { + struct net *net; + struct netns_ipvs *ipvs; __be16 _ports[2], *pptr; struct ip_vs_iphdr iph; int unicast; + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); @@ -459,18 +512,20 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_service_put(svc); return NF_DROP; } + net = skb_net(skb); #ifdef CONFIG_IP_VS_IPV6 if (svc->af == AF_INET6) unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; else #endif - unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); + unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST); /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create a cache_bypass connection entry */ - if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { + ipvs = net_ipvs(net); + if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) { int ret, cs; struct ip_vs_conn *cp; unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && @@ -484,12 +539,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(svc->af, iph.protocol, + ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr, pptr[0], &iph.daddr, pptr[1], &p); cp = ip_vs_conn_new(&p, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, - NULL); + NULL, skb->mark); if (!cp) return NF_DROP; } @@ -498,10 +553,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ip_vs_in_stats(cp, skb); /* set state */ - cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); /* transmit the first SYN packet */ - ret = cp->packet_xmit(skb, cp, pp); + ret = cp->packet_xmit(skb, cp, pd->pp); /* do not touch skb anymore */ atomic_inc(&cp->in_pkts); @@ -682,6 +737,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, unsigned int offset, unsigned int ihl) { + struct netns_ipvs *ipvs; unsigned int verdict = NF_DROP; if (IP_VS_FWD_METHOD(cp) != 0) { @@ -703,6 +759,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb, if (!skb_make_writable(skb, offset)) goto out; + ipvs = net_ipvs(skb_net(skb)); + #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) ip_vs_nat_icmp_v6(skb, pp, cp, 1); @@ -712,11 +770,11 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) + if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0) goto out; } else #endif - if ((sysctl_ip_vs_snat_reroute || + if ((ipvs->sysctl_snat_reroute || skb_rtable(skb)->rt_flags & RTCF_LOCAL) && ip_route_me_harder(skb, RTN_LOCAL) != 0) goto out; @@ -808,7 +866,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1); if (!cp) return NF_ACCEPT; @@ -885,7 +943,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, ip_vs_fill_iphdr(AF_INET6, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1); if (!cp) return NF_ACCEPT; @@ -924,9 +982,12 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) * Used for NAT and local client. */ static unsigned int -handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, +handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int ihl) { + struct ip_vs_protocol *pp = pd->pp; + struct netns_ipvs *ipvs; + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); if (!skb_make_writable(skb, ihl)) @@ -961,13 +1022,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * if it came from this machine itself. So re-compute * the routing information. */ + ipvs = net_ipvs(skb_net(skb)); + #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) + if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0) goto drop; } else #endif - if ((sysctl_ip_vs_snat_reroute || + if ((ipvs->sysctl_snat_reroute || skb_rtable(skb)->rt_flags & RTCF_LOCAL) && ip_route_me_harder(skb, RTN_LOCAL) != 0) goto drop; @@ -975,7 +1038,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); - ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd); skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) ip_vs_notrack(skb); @@ -999,9 +1062,12 @@ drop: static unsigned int ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) { + struct net *net = NULL; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; + struct netns_ipvs *ipvs; EnterFunction(11); @@ -1022,6 +1088,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely(!skb_dst(skb))) return NF_ACCEPT; + net = skb_net(skb); ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1045,9 +1112,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) + pd = ip_vs_proto_data_get(net, iph.protocol); + if (unlikely(!pd)) return NF_ACCEPT; + pp = pd->pp; /* reassemble IP fragments */ #ifdef CONFIG_IP_VS_IPV6 @@ -1073,11 +1141,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); + cp = pp->conn_out_get(af, skb, &iph, iph.len, 0); + ipvs = net_ipvs(net); if (likely(cp)) - return handle_response(af, skb, pp, cp, iph.len); - if (sysctl_ip_vs_nat_icmp_send && + return handle_response(af, skb, pd, cp, iph.len); + if (ipvs->sysctl_nat_icmp_send && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_SCTP)) { @@ -1087,7 +1156,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) sizeof(_ports), _ports); if (pptr == NULL) return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(af, iph.protocol, + if (ip_vs_lookup_real_service(net, af, iph.protocol, &iph.saddr, pptr[0])) { /* @@ -1202,12 +1271,14 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, static int ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) { + struct net *net = NULL; struct iphdr *iph; struct icmphdr _icmph, *ic; struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; unsigned int offset, ihl, verdict; union nf_inet_addr snet; @@ -1249,9 +1320,11 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - pp = ip_vs_proto_get(cih->protocol); - if (!pp) + net = skb_net(skb); + pd = ip_vs_proto_data_get(net, cih->protocol); + if (!pd) return NF_ACCEPT; + pp = pd->pp; /* Is the embedded protocol header present? */ if (unlikely(cih->frag_off & htons(IP_OFFSET) && @@ -1265,10 +1338,10 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); + cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); if (!cp) { /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1); if (cp) { snet.ip = iph->saddr; return handle_response_icmp(AF_INET, skb, &snet, @@ -1312,6 +1385,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) static int ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) { + struct net *net = NULL; struct ipv6hdr *iph; struct icmp6hdr _icmph, *ic; struct ipv6hdr _ciph, *cih; /* The ip header contained @@ -1319,6 +1393,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_iphdr ciph; struct ip_vs_conn *cp; struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; unsigned int offset, verdict; union nf_inet_addr snet; struct rt6_info *rt; @@ -1361,9 +1436,11 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) if (cih == NULL) return NF_ACCEPT; /* The packet looks wrong, ignore */ - pp = ip_vs_proto_get(cih->nexthdr); - if (!pp) + net = skb_net(skb); + pd = ip_vs_proto_data_get(net, cih->nexthdr); + if (!pd) return NF_ACCEPT; + pp = pd->pp; /* Is the embedded protocol header present? */ /* TODO: we don't support fragmentation at the moment anyways */ @@ -1377,10 +1454,10 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) ip_vs_fill_iphdr(AF_INET6, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); + cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1); if (!cp) { /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1); if (cp) { ipv6_addr_copy(&snet.in6, &iph->saddr); return handle_response_icmp(AF_INET6, skb, &snet, @@ -1423,10 +1500,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) static unsigned int ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) { + struct net *net; struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; struct ip_vs_conn *cp; int ret, restart, pkts; + struct netns_ipvs *ipvs; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) @@ -1480,20 +1560,21 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } + net = skb_net(skb); /* Protocol supported? */ - pp = ip_vs_proto_get(iph.protocol); - if (unlikely(!pp)) + pd = ip_vs_proto_data_get(net, iph.protocol); + if (unlikely(!pd)) return NF_ACCEPT; - + pp = pd->pp; /* * Check if the packet belongs to an existing connection entry */ - cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); + cp = pp->conn_in_get(af, skb, &iph, iph.len, 0); if (unlikely(!cp)) { int v; - if (!pp->conn_schedule(af, skb, pp, &v, &cp)) + if (!pp->conn_schedule(af, skb, pd, &v, &cp)) return v; } @@ -1505,12 +1586,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - + net = skb_net(skb); + ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { /* the destination server is not available */ - if (sysctl_ip_vs_expire_nodest_conn) { + if (ipvs->sysctl_expire_nodest_conn) { /* try to expire the connection immediately */ ip_vs_conn_expire_now(cp); } @@ -1521,7 +1603,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } ip_vs_in_stats(cp, skb); - restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd); if (cp->packet_xmit) ret = cp->packet_xmit(skb, cp, pp); /* do not touch skb anymore */ @@ -1535,35 +1617,41 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) * * Sync connection if it is about to close to * encorage the standby servers to update the connections timeout + * + * For ONE_PKT let ip_vs_sync_conn() do the filter work. */ - pkts = atomic_add_return(1, &cp->in_pkts); - if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) && + + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + pkts = ipvs->sysctl_sync_threshold[0]; + else + pkts = atomic_add_return(1, &cp->in_pkts); + + if ((ipvs->sync_state & IP_VS_STATE_MASTER) && cp->protocol == IPPROTO_SCTP) { if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && - (pkts % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) || + (pkts % ipvs->sysctl_sync_threshold[1] + == ipvs->sysctl_sync_threshold[0])) || (cp->old_state != cp->state && ((cp->state == IP_VS_SCTP_S_CLOSED) || (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { - ip_vs_sync_conn(cp); + ip_vs_sync_conn(net, cp); goto out; } } /* Keep this block last: TCP and others with pp->num_states <= 1 */ - else if (af == AF_INET && - (ip_vs_sync_state & IP_VS_STATE_MASTER) && + else if ((ipvs->sync_state & IP_VS_STATE_MASTER) && (((cp->protocol != IPPROTO_TCP || cp->state == IP_VS_TCP_S_ESTABLISHED) && - (pkts % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) || + (pkts % ipvs->sysctl_sync_threshold[1] + == ipvs->sysctl_sync_threshold[0])) || ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && ((cp->state == IP_VS_TCP_S_FIN_WAIT) || (cp->state == IP_VS_TCP_S_CLOSE) || (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(cp); + ip_vs_sync_conn(net, cp); out: cp->old_state = cp->state; @@ -1782,7 +1870,41 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { }, #endif }; +/* + * Initialize IP Virtual Server netns mem. + */ +static int __net_init __ip_vs_init(struct net *net) +{ + struct netns_ipvs *ipvs; + + ipvs = net_generic(net, ip_vs_net_id); + if (ipvs == NULL) { + pr_err("%s(): no memory.\n", __func__); + return -ENOMEM; + } + ipvs->net = net; + /* Counters used for creating unique names */ + ipvs->gen = atomic_read(&ipvs_netns_cnt); + atomic_inc(&ipvs_netns_cnt); + net->ipvs = ipvs; + printk(KERN_INFO "IPVS: Creating netns size=%lu id=%d\n", + sizeof(struct netns_ipvs), ipvs->gen); + return 0; +} + +static void __net_exit __ip_vs_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + IP_VS_DBG(10, "ipvs netns %d released\n", ipvs->gen); +} + +static struct pernet_operations ipvs_core_ops = { + .init = __ip_vs_init, + .exit = __ip_vs_cleanup, + .id = &ip_vs_net_id, + .size = sizeof(struct netns_ipvs), +}; /* * Initialize IP Virtual Server @@ -1791,8 +1913,11 @@ static int __init ip_vs_init(void) { int ret; - ip_vs_estimator_init(); + ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ + if (ret < 0) + return ret; + ip_vs_estimator_init(); ret = ip_vs_control_init(); if (ret < 0) { pr_err("can't setup control.\n"); @@ -1813,15 +1938,23 @@ static int __init ip_vs_init(void) goto cleanup_app; } + ret = ip_vs_sync_init(); + if (ret < 0) { + pr_err("can't setup sync data.\n"); + goto cleanup_conn; + } + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); if (ret < 0) { pr_err("can't register hooks.\n"); - goto cleanup_conn; + goto cleanup_sync; } pr_info("ipvs loaded.\n"); return ret; +cleanup_sync: + ip_vs_sync_cleanup(); cleanup_conn: ip_vs_conn_cleanup(); cleanup_app: @@ -1831,17 +1964,20 @@ static int __init ip_vs_init(void) ip_vs_control_cleanup(); cleanup_estimator: ip_vs_estimator_cleanup(); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ return ret; } static void __exit ip_vs_cleanup(void) { nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ip_vs_sync_cleanup(); ip_vs_conn_cleanup(); ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); ip_vs_estimator_cleanup(); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 22f7ad5101a..09ca2ce2f2b 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -38,6 +38,7 @@ #include <linux/mutex.h> #include <net/net_namespace.h> +#include <linux/nsproxy.h> #include <net/ip.h> #ifdef CONFIG_IP_VS_IPV6 #include <net/ipv6.h> @@ -57,42 +58,7 @@ static DEFINE_MUTEX(__ip_vs_mutex); /* lock for service table */ static DEFINE_RWLOCK(__ip_vs_svc_lock); -/* lock for table with the real services */ -static DEFINE_RWLOCK(__ip_vs_rs_lock); - -/* lock for state and timeout tables */ -static DEFINE_SPINLOCK(ip_vs_securetcp_lock); - -/* lock for drop entry handling */ -static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); - -/* lock for drop packet handling */ -static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); - -/* 1/rate drop and drop-entry variables */ -int ip_vs_drop_rate = 0; -int ip_vs_drop_counter = 0; -static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); - -/* number of virtual services */ -static int ip_vs_num_services = 0; - /* sysctl variables */ -static int sysctl_ip_vs_drop_entry = 0; -static int sysctl_ip_vs_drop_packet = 0; -static int sysctl_ip_vs_secure_tcp = 0; -static int sysctl_ip_vs_amemthresh = 1024; -static int sysctl_ip_vs_am_droprate = 10; -int sysctl_ip_vs_cache_bypass = 0; -int sysctl_ip_vs_expire_nodest_conn = 0; -int sysctl_ip_vs_expire_quiescent_template = 0; -int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; -int sysctl_ip_vs_nat_icmp_send = 0; -#ifdef CONFIG_IP_VS_NFCT -int sysctl_ip_vs_conntrack; -#endif -int sysctl_ip_vs_snat_reroute = 1; - #ifdef CONFIG_IP_VS_DEBUG static int sysctl_ip_vs_debug_level = 0; @@ -105,7 +71,8 @@ int ip_vs_get_debug_level(void) #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ -static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) +static int __ip_vs_addr_is_local_v6(struct net *net, + const struct in6_addr *addr) { struct rt6_info *rt; struct flowi fl = { @@ -114,7 +81,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) .fl6_src = { .s6_addr32 = {0, 0, 0, 0} }, }; - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl); if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) return 1; @@ -125,7 +92,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) * update_defense_level is called from keventd and from sysctl, * so it needs to protect itself from softirqs */ -static void update_defense_level(void) +static void update_defense_level(struct netns_ipvs *ipvs) { struct sysinfo i; static int old_secure_tcp = 0; @@ -141,73 +108,73 @@ static void update_defense_level(void) /* si_swapinfo(&i); */ /* availmem = availmem - (i.totalswap - i.freeswap); */ - nomem = (availmem < sysctl_ip_vs_amemthresh); + nomem = (availmem < ipvs->sysctl_amemthresh); local_bh_disable(); /* drop_entry */ - spin_lock(&__ip_vs_dropentry_lock); - switch (sysctl_ip_vs_drop_entry) { + spin_lock(&ipvs->dropentry_lock); + switch (ipvs->sysctl_drop_entry) { case 0: - atomic_set(&ip_vs_dropentry, 0); + atomic_set(&ipvs->dropentry, 0); break; case 1: if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - sysctl_ip_vs_drop_entry = 2; + atomic_set(&ipvs->dropentry, 1); + ipvs->sysctl_drop_entry = 2; } else { - atomic_set(&ip_vs_dropentry, 0); + atomic_set(&ipvs->dropentry, 0); } break; case 2: if (nomem) { - atomic_set(&ip_vs_dropentry, 1); + atomic_set(&ipvs->dropentry, 1); } else { - atomic_set(&ip_vs_dropentry, 0); - sysctl_ip_vs_drop_entry = 1; + atomic_set(&ipvs->dropentry, 0); + ipvs->sysctl_drop_entry = 1; }; break; case 3: - atomic_set(&ip_vs_dropentry, 1); + atomic_set(&ipvs->dropentry, 1); break; } - spin_unlock(&__ip_vs_dropentry_lock); + spin_unlock(&ipvs->dropentry_lock); /* drop_packet */ - spin_lock(&__ip_vs_droppacket_lock); - switch (sysctl_ip_vs_drop_packet) { + spin_lock(&ipvs->droppacket_lock); + switch (ipvs->sysctl_drop_packet) { case 0: - ip_vs_drop_rate = 0; + ipvs->drop_rate = 0; break; case 1: if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - sysctl_ip_vs_drop_packet = 2; + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); + ipvs->sysctl_drop_packet = 2; } else { - ip_vs_drop_rate = 0; + ipvs->drop_rate = 0; } break; case 2: if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); + ipvs->drop_rate = ipvs->drop_counter + = ipvs->sysctl_amemthresh / + (ipvs->sysctl_amemthresh-availmem); } else { - ip_vs_drop_rate = 0; - sysctl_ip_vs_drop_packet = 1; + ipvs->drop_rate = 0; + ipvs->sysctl_drop_packet = 1; } break; case 3: - ip_vs_drop_rate = sysctl_ip_vs_am_droprate; + ipvs->drop_rate = ipvs->sysctl_am_droprate; break; } - spin_unlock(&__ip_vs_droppacket_lock); + spin_unlock(&ipvs->droppacket_lock); /* secure_tcp */ - spin_lock(&ip_vs_securetcp_lock); - switch (sysctl_ip_vs_secure_tcp) { + spin_lock(&ipvs->securetcp_lock); + switch (ipvs->sysctl_secure_tcp) { case 0: if (old_secure_tcp >= 2) to_change = 0; @@ -216,7 +183,7 @@ static void update_defense_level(void) if (nomem) { if (old_secure_tcp < 2) to_change = 1; - sysctl_ip_vs_secure_tcp = 2; + ipvs->sysctl_secure_tcp = 2; } else { if (old_secure_tcp >= 2) to_change = 0; @@ -229,7 +196,7 @@ static void update_defense_level(void) } else { if (old_secure_tcp >= 2) to_change = 0; - sysctl_ip_vs_secure_tcp = 1; + ipvs->sysctl_secure_tcp = 1; } break; case 3: @@ -237,10 +204,11 @@ static void update_defense_level(void) to_change = 1; break; } - old_secure_tcp = sysctl_ip_vs_secure_tcp; + old_secure_tcp = ipvs->sysctl_secure_tcp; if (to_change >= 0) - ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); - spin_unlock(&ip_vs_securetcp_lock); + ip_vs_protocol_timeout_change(ipvs, + ipvs->sysctl_secure_tcp > 1); + spin_unlock(&ipvs->securetcp_lock); local_bh_enable(); } @@ -250,16 +218,16 @@ static void update_defense_level(void) * Timer for checking the defense */ #define DEFENSE_TIMER_PERIOD 1*HZ -static void defense_work_handler(struct work_struct *work); -static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); static void defense_work_handler(struct work_struct *work) { - update_defense_level(); - if (atomic_read(&ip_vs_dropentry)) - ip_vs_random_dropentry(); + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, defense_work.work); - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); + update_defense_level(ipvs); + if (atomic_read(&ipvs->dropentry)) + ip_vs_random_dropentry(ipvs->net); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); } int @@ -287,33 +255,13 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; /* the service table hashed by fwmark */ static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; -/* - * Hash table: for real service lookups - */ -#define IP_VS_RTAB_BITS 4 -#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) -#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - -static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; - -/* - * Trash for destinations - */ -static LIST_HEAD(ip_vs_dest_trash); - -/* - * FTP & NULL virtual service counters - */ -static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); -static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); - /* * Returns hash value for virtual service */ -static __inline__ unsigned -ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, - __be16 port) +static inline unsigned +ip_vs_svc_hashkey(struct net *net, int af, unsigned proto, + const union nf_inet_addr *addr, __be16 port) { register unsigned porth = ntohs(port); __be32 addr_fold = addr->ip; @@ -323,6 +271,7 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, addr_fold = addr->ip6[0]^addr->ip6[1]^ addr->ip6[2]^addr->ip6[3]; #endif + addr_fold ^= ((size_t)net>>8); return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) & IP_VS_SVC_TAB_MASK; @@ -331,13 +280,13 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, /* * Returns hash value of fwmark for virtual service lookup */ -static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark) { - return fwmark & IP_VS_SVC_TAB_MASK; + return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK; } /* - * Hashes a service in the ip_vs_svc_table by <proto,addr,port> + * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port> * or in the ip_vs_svc_fwm_table by fwmark. * Should be called with locked tables. */ @@ -353,16 +302,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) if (svc->fwmark == 0) { /* - * Hash it by <protocol,addr,port> in ip_vs_svc_table + * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table */ - hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, - svc->port); + hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol, + &svc->addr, svc->port); list_add(&svc->s_list, &ip_vs_svc_table[hash]); } else { /* - * Hash it by fwmark in ip_vs_svc_fwm_table + * Hash it by fwmark in svc_fwm_table */ - hash = ip_vs_svc_fwm_hashkey(svc->fwmark); + hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark); list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); } @@ -374,7 +323,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc) /* - * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Unhashes a service from svc_table / svc_fwm_table. * Should be called with locked tables. */ static int ip_vs_svc_unhash(struct ip_vs_service *svc) @@ -386,10 +335,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) } if (svc->fwmark == 0) { - /* Remove it from the ip_vs_svc_table table */ + /* Remove it from the svc_table table */ list_del(&svc->s_list); } else { - /* Remove it from the ip_vs_svc_fwm_table table */ + /* Remove it from the svc_fwm_table table */ list_del(&svc->f_list); } @@ -400,23 +349,24 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) /* - * Get service by {proto,addr,port} in the service table. + * Get service by {netns, proto,addr,port} in the service table. */ static inline struct ip_vs_service * -__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr, - __be16 vport) +__ip_vs_service_find(struct net *net, int af, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) { unsigned hash; struct ip_vs_service *svc; /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); + hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport); list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ if ((svc->af == af) && ip_vs_addr_equal(af, &svc->addr, vaddr) && (svc->port == vport) - && (svc->protocol == protocol)) { + && (svc->protocol == protocol) + && net_eq(svc->net, net)) { /* HIT */ return svc; } @@ -430,16 +380,17 @@ __ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr, * Get service by {fwmark} in the service table. */ static inline struct ip_vs_service * -__ip_vs_svc_fwm_find(int af, __u32 fwmark) +__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark) { unsigned hash; struct ip_vs_service *svc; /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(fwmark); + hash = ip_vs_svc_fwm_hashkey(net, fwmark); list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark && svc->af == af) { + if (svc->fwmark == fwmark && svc->af == af + && net_eq(svc->net, net)) { /* HIT */ return svc; } @@ -449,42 +400,44 @@ __ip_vs_svc_fwm_find(int af, __u32 fwmark) } struct ip_vs_service * -ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, +ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { struct ip_vs_service *svc; + struct netns_ipvs *ipvs = net_ipvs(net); read_lock(&__ip_vs_svc_lock); /* * Check the table hashed by fwmark first */ - if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark))) + svc = __ip_vs_svc_fwm_find(net, af, fwmark); + if (fwmark && svc) goto out; /* * Check the table hashed by <protocol,addr,port> * for "full" addressed entries */ - svc = __ip_vs_service_find(af, protocol, vaddr, vport); + svc = __ip_vs_service_find(net, af, protocol, vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP - && atomic_read(&ip_vs_ftpsvc_counter) + && atomic_read(&ipvs->ftpsvc_counter) && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { /* * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ - svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT); + svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT); } if (svc == NULL - && atomic_read(&ip_vs_nullsvc_counter)) { + && atomic_read(&ipvs->nullsvc_counter)) { /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_find(af, protocol, vaddr, 0); + svc = __ip_vs_service_find(net, af, protocol, vaddr, 0); } out: @@ -519,6 +472,7 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest) svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port), atomic_read(&svc->usecnt)); + free_percpu(svc->stats.cpustats); kfree(svc); } } @@ -545,10 +499,10 @@ static inline unsigned ip_vs_rs_hashkey(int af, } /* - * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. + * Hashes ip_vs_dest in rs_table by <proto,addr,port>. * should be called with locked tables. */ -static int ip_vs_rs_hash(struct ip_vs_dest *dest) +static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest) { unsigned hash; @@ -562,19 +516,19 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest) */ hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); - list_add(&dest->d_list, &ip_vs_rtable[hash]); + list_add(&dest->d_list, &ipvs->rs_table[hash]); return 1; } /* - * UNhashes ip_vs_dest from ip_vs_rtable. + * UNhashes ip_vs_dest from rs_table. * should be called with locked tables. */ static int ip_vs_rs_unhash(struct ip_vs_dest *dest) { /* - * Remove it from the ip_vs_rtable table. + * Remove it from the rs_table table. */ if (!list_empty(&dest->d_list)) { list_del(&dest->d_list); @@ -588,10 +542,11 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest) * Lookup real service by <proto,addr,port> in the real service table. */ struct ip_vs_dest * -ip_vs_lookup_real_service(int af, __u16 protocol, +ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol, const union nf_inet_addr *daddr, __be16 dport) { + struct netns_ipvs *ipvs = net_ipvs(net); unsigned hash; struct ip_vs_dest *dest; @@ -601,19 +556,19 @@ ip_vs_lookup_real_service(int af, __u16 protocol, */ hash = ip_vs_rs_hashkey(af, daddr, dport); - read_lock(&__ip_vs_rs_lock); - list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { + read_lock(&ipvs->rs_lock); + list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) { if ((dest->af == af) && ip_vs_addr_equal(af, &dest->addr, daddr) && (dest->port == dport) && ((dest->protocol == protocol) || dest->vfwmark)) { /* HIT */ - read_unlock(&__ip_vs_rs_lock); + read_unlock(&ipvs->rs_lock); return dest; } } - read_unlock(&__ip_vs_rs_lock); + read_unlock(&ipvs->rs_lock); return NULL; } @@ -652,15 +607,16 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * ip_vs_lookup_real_service() looked promissing, but * seems not working as expected. */ -struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, +struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af, + const union nf_inet_addr *daddr, __be16 dport, const union nf_inet_addr *vaddr, - __be16 vport, __u16 protocol) + __be16 vport, __u16 protocol, __u32 fwmark) { struct ip_vs_dest *dest; struct ip_vs_service *svc; - svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); + svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport); if (!svc) return NULL; dest = ip_vs_lookup_dest(svc, daddr, dport); @@ -685,11 +641,12 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, __be16 dport) { struct ip_vs_dest *dest, *nxt; + struct netns_ipvs *ipvs = net_ipvs(svc->net); /* * Find the destination in trash */ - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " "dest->refcnt=%d\n", dest->vfwmark, @@ -720,6 +677,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, list_del(&dest->n_list); ip_vs_dst_reset(dest); __ip_vs_unbind_svc(dest); + free_percpu(dest->stats.cpustats); kfree(dest); } } @@ -737,14 +695,16 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, * are expired, and the refcnt of each destination in the trash must * be 1, so we simply release them here. */ -static void ip_vs_trash_cleanup(void) +static void ip_vs_trash_cleanup(struct net *net) { struct ip_vs_dest *dest, *nxt; + struct netns_ipvs *ipvs = net_ipvs(net); - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) { list_del(&dest->n_list); ip_vs_dst_reset(dest); __ip_vs_unbind_svc(dest); + free_percpu(dest->stats.cpustats); kfree(dest); } } @@ -768,6 +728,7 @@ static void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest, int add) { + struct netns_ipvs *ipvs = net_ipvs(svc->net); int conn_flags; /* set the weight and the flags */ @@ -780,12 +741,12 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { /* - * Put the real service in ip_vs_rtable if not present. + * Put the real service in rs_table if not present. * For now only for NAT! */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_hash(dest); - write_unlock_bh(&__ip_vs_rs_lock); + write_lock_bh(&ipvs->rs_lock); + ip_vs_rs_hash(ipvs, dest); + write_unlock_bh(&ipvs->rs_lock); } atomic_set(&dest->conn_flags, conn_flags); @@ -813,7 +774,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, spin_unlock(&dest->dst_lock); if (add) - ip_vs_new_estimator(&dest->stats); + ip_vs_new_estimator(svc->net, &dest->stats); write_lock_bh(&__ip_vs_svc_lock); @@ -850,12 +811,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && - !__ip_vs_addr_is_local_v6(&udest->addr.in6)) + !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6)) return -EINVAL; } else #endif { - atype = inet_addr_type(&init_net, udest->addr.ip); + atype = inet_addr_type(svc->net, udest->addr.ip); if (atype != RTN_LOCAL && atype != RTN_UNICAST) return -EINVAL; } @@ -865,6 +826,11 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, pr_err("%s(): no memory.\n", __func__); return -ENOMEM; } + dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!dest->stats.cpustats) { + pr_err("%s() alloc_percpu failed\n", __func__); + goto err_alloc; + } dest->af = svc->af; dest->protocol = svc->protocol; @@ -888,6 +854,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, LeaveFunction(2); return 0; + +err_alloc: + kfree(dest); + return -ENOMEM; } @@ -1006,16 +976,18 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete a destination (must be already unlinked from the service) */ -static void __ip_vs_del_dest(struct ip_vs_dest *dest) +static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest) { - ip_vs_kill_estimator(&dest->stats); + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_kill_estimator(net, &dest->stats); /* * Remove it from the d-linked list with the real services. */ - write_lock_bh(&__ip_vs_rs_lock); + write_lock_bh(&ipvs->rs_lock); ip_vs_rs_unhash(dest); - write_unlock_bh(&__ip_vs_rs_lock); + write_unlock_bh(&ipvs->rs_lock); /* * Decrease the refcnt of the dest, and free the dest @@ -1034,6 +1006,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) and only one user context can update virtual service at a time, so the operation here is OK */ atomic_dec(&dest->svc->refcnt); + free_percpu(dest->stats.cpustats); kfree(dest); } else { IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " @@ -1041,7 +1014,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ip_vs_dest_trash); + list_add(&dest->n_list, &ipvs->dest_trash); atomic_inc(&dest->refcnt); } } @@ -1105,7 +1078,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Delete the destination */ - __ip_vs_del_dest(dest); + __ip_vs_del_dest(svc->net, dest); LeaveFunction(2); @@ -1117,13 +1090,14 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) * Add a service into the service hash table */ static int -ip_vs_add_service(struct ip_vs_service_user_kern *u, +ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { int ret = 0; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; + struct netns_ipvs *ipvs = net_ipvs(net); /* increase the module use count */ ip_vs_use_count_inc(); @@ -1137,7 +1111,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, } if (u->pe_name && *u->pe_name) { - pe = ip_vs_pe_get(u->pe_name); + pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); @@ -1159,6 +1133,11 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, ret = -ENOMEM; goto out_err; } + svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!svc->stats.cpustats) { + pr_err("%s() alloc_percpu failed\n", __func__); + goto out_err; + } /* I'm the first user of the service */ atomic_set(&svc->usecnt, 0); @@ -1172,6 +1151,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, svc->flags = u->flags; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; + svc->net = net; INIT_LIST_HEAD(&svc->destinations); rwlock_init(&svc->sched_lock); @@ -1189,15 +1169,15 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, /* Update the virtual service counters */ if (svc->port == FTPPORT) - atomic_inc(&ip_vs_ftpsvc_counter); + atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) - atomic_inc(&ip_vs_nullsvc_counter); + atomic_inc(&ipvs->nullsvc_counter); - ip_vs_new_estimator(&svc->stats); + ip_vs_new_estimator(net, &svc->stats); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) - ip_vs_num_services++; + ipvs->num_services++; /* Hash the service into the service table */ write_lock_bh(&__ip_vs_svc_lock); @@ -1207,6 +1187,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, *svc_p = svc; return 0; + out_err: if (svc != NULL) { ip_vs_unbind_scheduler(svc); @@ -1215,6 +1196,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, ip_vs_app_inc_put(svc->inc); local_bh_enable(); } + if (svc->stats.cpustats) + free_percpu(svc->stats.cpustats); kfree(svc); } ip_vs_scheduler_put(sched); @@ -1248,7 +1231,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) old_sched = sched; if (u->pe_name && *u->pe_name) { - pe = ip_vs_pe_get(u->pe_name); + pe = ip_vs_pe_getbyname(u->pe_name); if (pe == NULL) { pr_info("persistence engine module ip_vs_pe_%s " "not found\n", u->pe_name); @@ -1334,14 +1317,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; struct ip_vs_pe *old_pe; + struct netns_ipvs *ipvs = net_ipvs(svc->net); pr_info("%s: enter\n", __func__); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) - ip_vs_num_services--; + ipvs->num_services--; - ip_vs_kill_estimator(&svc->stats); + ip_vs_kill_estimator(svc->net, &svc->stats); /* Unbind scheduler */ old_sched = svc->scheduler; @@ -1364,16 +1348,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) */ list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(dest); + __ip_vs_del_dest(svc->net, dest); } /* * Update the virtual service counters */ if (svc->port == FTPPORT) - atomic_dec(&ip_vs_ftpsvc_counter); + atomic_dec(&ipvs->ftpsvc_counter); else if (svc->port == 0) - atomic_dec(&ip_vs_nullsvc_counter); + atomic_dec(&ipvs->nullsvc_counter); /* * Free the service if nobody refers to it @@ -1383,6 +1367,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port), atomic_read(&svc->usecnt)); + free_percpu(svc->stats.cpustats); kfree(svc); } @@ -1428,17 +1413,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Flush all the virtual services */ -static int ip_vs_flush(void) +static int ip_vs_flush(struct net *net) { int idx; struct ip_vs_service *svc, *nxt; /* - * Flush the service table hashed by <protocol,addr,port> + * Flush the service table hashed by <netns,protocol,addr,port> */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { - ip_vs_unlink_service(svc); + list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], + s_list) { + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc); } } @@ -1448,7 +1435,8 @@ static int ip_vs_flush(void) for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry_safe(svc, nxt, &ip_vs_svc_fwm_table[idx], f_list) { - ip_vs_unlink_service(svc); + if (net_eq(svc->net, net)) + ip_vs_unlink_service(svc); } } @@ -1472,24 +1460,26 @@ static int ip_vs_zero_service(struct ip_vs_service *svc) return 0; } -static int ip_vs_zero_all(void) +static int ip_vs_zero_all(struct net *net) { int idx; struct ip_vs_service *svc; for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - ip_vs_zero_service(svc); + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); } } for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - ip_vs_zero_service(svc); + if (net_eq(svc->net, net)) + ip_vs_zero_service(svc); } } - ip_vs_zero_stats(&ip_vs_stats); + ip_vs_zero_stats(net_ipvs(net)->tot_stats); return 0; } @@ -1498,6 +1488,7 @@ static int proc_do_defense_mode(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + struct net *net = current->nsproxy->net_ns; int *valp = table->data; int val = *valp; int rc; @@ -1508,7 +1499,7 @@ proc_do_defense_mode(ctl_table *table, int write, /* Restore the correct value */ *valp = val; } else { - update_defense_level(); + update_defense_level(net_ipvs(net)); } } return rc; @@ -1534,45 +1525,54 @@ proc_do_sync_threshold(ctl_table *table, int write, return rc; } +static int +proc_do_sync_mode(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 1)) { + /* Restore the correct value */ + *valp = val; + } else { + struct net *net = current->nsproxy->net_ns; + ip_vs_sync_switch_mode(net, val); + } + } + return rc; +} /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + * Do not change order or insert new entries without + * align with netns init in __ip_vs_control_init() */ static struct ctl_table vs_vars[] = { { .procname = "amemthresh", - .data = &sysctl_ip_vs_amemthresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, -#ifdef CONFIG_IP_VS_DEBUG - { - .procname = "debug_level", - .data = &sysctl_ip_vs_debug_level, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif { .procname = "am_droprate", - .data = &sysctl_ip_vs_am_droprate, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "drop_entry", - .data = &sysctl_ip_vs_drop_entry, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, { .procname = "drop_packet", - .data = &sysctl_ip_vs_drop_packet, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, @@ -1580,7 +1580,6 @@ static struct ctl_table vs_vars[] = { #ifdef CONFIG_IP_VS_NFCT { .procname = "conntrack", - .data = &sysctl_ip_vs_conntrack, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -1588,18 +1587,62 @@ static struct ctl_table vs_vars[] = { #endif { .procname = "secure_tcp", - .data = &sysctl_ip_vs_secure_tcp, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_defense_mode, }, { .procname = "snat_reroute", - .data = &sysctl_ip_vs_snat_reroute, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .procname = "sync_version", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_sync_mode, + }, + { + .procname = "cache_bypass", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_nodest_conn", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "expire_quiescent_template", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sync_threshold", + .maxlen = + sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold), + .mode = 0644, + .proc_handler = proc_do_sync_threshold, + }, + { + .procname = "nat_icmp_send", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #if 0 { .procname = "timeout_established", @@ -1686,41 +1729,6 @@ static struct ctl_table vs_vars[] = { .proc_handler = proc_dointvec_jiffies, }, #endif - { - .procname = "cache_bypass", - .data = &sysctl_ip_vs_cache_bypass, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "expire_nodest_conn", - .data = &sysctl_ip_vs_expire_nodest_conn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "expire_quiescent_template", - .data = &sysctl_ip_vs_expire_quiescent_template, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "sync_threshold", - .data = &sysctl_ip_vs_sync_threshold, - .maxlen = sizeof(sysctl_ip_vs_sync_threshold), - .mode = 0644, - .proc_handler = proc_do_sync_threshold, - }, - { - .procname = "nat_icmp_send", - .data = &sysctl_ip_vs_nat_icmp_send, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { } }; @@ -1732,11 +1740,10 @@ const struct ctl_path net_vs_ctl_path[] = { }; EXPORT_SYMBOL_GPL(net_vs_ctl_path); -static struct ctl_table_header * sysctl_header; - #ifdef CONFIG_PROC_FS struct ip_vs_iter { + struct seq_net_private p; /* Do not move this, netns depends upon it*/ struct list_head *table; int bucket; }; @@ -1763,6 +1770,7 @@ static inline const char *ip_vs_fwd_name(unsigned flags) /* Get the Nth entry in the two lists */ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) { + struct net *net = seq_file_net(seq); struct ip_vs_iter *iter = seq->private; int idx; struct ip_vs_service *svc; @@ -1770,7 +1778,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* look in hash by protocol */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (pos-- == 0){ + if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_table; iter->bucket = idx; return svc; @@ -1781,7 +1789,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) /* keep looking in fwmark */ for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (pos-- == 0) { + if (net_eq(svc->net, net) && pos-- == 0) { iter->table = ip_vs_svc_fwm_table; iter->bucket = idx; return svc; @@ -1935,7 +1943,7 @@ static const struct seq_operations ip_vs_info_seq_ops = { static int ip_vs_info_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &ip_vs_info_seq_ops, + return seq_open_net(inode, file, &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)); } @@ -1949,13 +1957,11 @@ static const struct file_operations ip_vs_info_fops = { #endif -struct ip_vs_stats ip_vs_stats = { - .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), -}; - #ifdef CONFIG_PROC_FS static int ip_vs_stats_show(struct seq_file *seq, void *v) { + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats; /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, @@ -1963,29 +1969,29 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) seq_printf(seq, " Conns Packets Packets Bytes Bytes\n"); - spin_lock_bh(&ip_vs_stats.lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, - ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, - (unsigned long long) ip_vs_stats.ustats.inbytes, - (unsigned long long) ip_vs_stats.ustats.outbytes); + spin_lock_bh(&tot_stats->lock); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns, + tot_stats->ustats.inpkts, tot_stats->ustats.outpkts, + (unsigned long long) tot_stats->ustats.inbytes, + (unsigned long long) tot_stats->ustats.outbytes); /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); seq_printf(seq,"%8X %8X %8X %16X %16X\n", - ip_vs_stats.ustats.cps, - ip_vs_stats.ustats.inpps, - ip_vs_stats.ustats.outpps, - ip_vs_stats.ustats.inbps, - ip_vs_stats.ustats.outbps); - spin_unlock_bh(&ip_vs_stats.lock); + tot_stats->ustats.cps, + tot_stats->ustats.inpps, + tot_stats->ustats.outpps, + tot_stats->ustats.inbps, + tot_stats->ustats.outbps); + spin_unlock_bh(&tot_stats->lock); return 0; } static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) { - return single_open(file, ip_vs_stats_show, NULL); + return single_open_net(inode, file, ip_vs_stats_show); } static const struct file_operations ip_vs_stats_fops = { @@ -1996,13 +2002,68 @@ static const struct file_operations ip_vs_stats_fops = { .release = single_release, }; +static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) +{ + struct net *net = seq_file_single_net(seq); + struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats; + int i; + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + "CPU Conns Packets Packets Bytes Bytes\n"); + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *u = per_cpu_ptr(net->ipvs->cpustats, i); + seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n", + i, u->ustats.conns, u->ustats.inpkts, + u->ustats.outpkts, (__u64)u->ustats.inbytes, + (__u64)u->ustats.outbytes); + } + + spin_lock_bh(&tot_stats->lock); + seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n", + tot_stats->ustats.conns, tot_stats->ustats.inpkts, + tot_stats->ustats.outpkts, + (unsigned long long) tot_stats->ustats.inbytes, + (unsigned long long) tot_stats->ustats.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq, " %8X %8X %8X %16X %16X\n", + tot_stats->ustats.cps, + tot_stats->ustats.inpps, + tot_stats->ustats.outpps, + tot_stats->ustats.inbps, + tot_stats->ustats.outbps); + spin_unlock_bh(&tot_stats->lock); + + return 0; +} + +static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file) +{ + return single_open_net(inode, file, ip_vs_stats_percpu_show); +} + +static const struct file_operations ip_vs_stats_percpu_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_percpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif /* * Set timeout values for tcp tcpfin udp in the timeout_table. */ -static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) +static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u) { + struct ip_vs_proto_data *pd; + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", u->tcp_timeout, u->tcp_fin_timeout, @@ -2010,19 +2071,22 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) #ifdef CONFIG_IP_VS_PROTO_TCP if (u->tcp_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = u->tcp_timeout * HZ; } if (u->tcp_fin_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] = u->tcp_fin_timeout * HZ; } #endif #ifdef CONFIG_IP_VS_PROTO_UDP if (u->udp_timeout) { - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + pd->timeout_table[IP_VS_UDP_S_NORMAL] = u->udp_timeout * HZ; } #endif @@ -2087,6 +2151,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, static int do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) { + struct net *net = sock_net(sk); int ret; unsigned char arg[MAX_ARG_LEN]; struct ip_vs_service_user *usvc_compat; @@ -2121,19 +2186,20 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_FLUSH) { /* Flush the virtual service */ - ret = ip_vs_flush(); + ret = ip_vs_flush(net); goto out_unlock; } else if (cmd == IP_VS_SO_SET_TIMEOUT) { /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); + ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg); goto out_unlock; } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); + ret = start_sync_thread(net, dm->state, dm->mcast_ifn, + dm->syncid); goto out_unlock; } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = stop_sync_thread(dm->state); + ret = stop_sync_thread(net, dm->state); goto out_unlock; } @@ -2148,7 +2214,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd == IP_VS_SO_SET_ZERO) { /* if no service address is set, zero counters in all */ if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { - ret = ip_vs_zero_all(); + ret = ip_vs_zero_all(net); goto out_unlock; } } @@ -2165,10 +2231,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* Lookup the exact service by <protocol, addr, port> or fwmark */ if (usvc.fwmark == 0) - svc = __ip_vs_service_find(usvc.af, usvc.protocol, + svc = __ip_vs_service_find(net, usvc.af, usvc.protocol, &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark); + svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { @@ -2181,7 +2247,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (svc != NULL) ret = -EEXIST; else - ret = ip_vs_add_service(&usvc, &svc); + ret = ip_vs_add_service(net, &usvc, &svc); break; case IP_VS_SO_SET_EDIT: ret = ip_vs_edit_service(svc, &usvc); @@ -2241,7 +2307,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) } static inline int -__ip_vs_get_service_entries(const struct ip_vs_get_services *get, +__ip_vs_get_service_entries(struct net *net, + const struct ip_vs_get_services *get, struct ip_vs_get_services __user *uptr) { int idx, count=0; @@ -2252,7 +2319,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) + if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; if (count >= get->num_services) @@ -2271,7 +2338,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { /* Only expose IPv4 entries to old interface */ - if (svc->af != AF_INET) + if (svc->af != AF_INET || !net_eq(svc->net, net)) continue; if (count >= get->num_services) @@ -2291,7 +2358,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get, } static inline int -__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, +__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get, struct ip_vs_get_dests __user *uptr) { struct ip_vs_service *svc; @@ -2299,9 +2366,9 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, int ret = 0; if (get->fwmark) - svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark); + svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark); else - svc = __ip_vs_service_find(AF_INET, get->protocol, &addr, + svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr, get->port); if (svc) { @@ -2336,17 +2403,19 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, } static inline void -__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u) { + struct ip_vs_proto_data *pd; + #ifdef CONFIG_IP_VS_PROTO_TCP - u->tcp_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; - u->tcp_fin_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; + pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; #endif #ifdef CONFIG_IP_VS_PROTO_UDP + pd = ip_vs_proto_data_get(net, IPPROTO_UDP); u->udp_timeout = - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; + pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ; #endif } @@ -2375,7 +2444,10 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) unsigned char arg[128]; int ret = 0; unsigned int copylen; + struct net *net = sock_net(sk); + struct netns_ipvs *ipvs = net_ipvs(net); + BUG_ON(!net); if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -2418,7 +2490,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) struct ip_vs_getinfo info; info.version = IP_VS_VERSION_CODE; info.size = ip_vs_conn_tab_size; - info.num_services = ip_vs_num_services; + info.num_services = ipvs->num_services; if (copy_to_user(user, &info, sizeof(info)) != 0) ret = -EFAULT; } @@ -2437,7 +2509,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EINVAL; goto out; } - ret = __ip_vs_get_service_entries(get, user); + ret = __ip_vs_get_service_entries(net, get, user); } break; @@ -2450,10 +2522,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; if (entry->fwmark) - svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark); + svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark); else - svc = __ip_vs_service_find(AF_INET, entry->protocol, - &addr, entry->port); + svc = __ip_vs_service_find(net, AF_INET, + entry->protocol, &addr, + entry->port); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) @@ -2476,7 +2549,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) ret = -EINVAL; goto out; } - ret = __ip_vs_get_dest_entries(get, user); + ret = __ip_vs_get_dest_entries(net, get, user); } break; @@ -2484,7 +2557,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(&t); + __ip_vs_get_timeouts(net, &t); if (copy_to_user(user, &t, sizeof(t)) != 0) ret = -EFAULT; } @@ -2495,15 +2568,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) struct ip_vs_daemon_user d[2]; memset(&d, 0, sizeof(d)); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) { + if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ip_vs_master_syncid; + strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + sizeof(d[0].mcast_ifn)); + d[0].syncid = ipvs->master_syncid; } - if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { + if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ip_vs_backup_syncid; + strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + sizeof(d[1].mcast_ifn)); + d[1].syncid = ipvs->backup_syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; @@ -2542,6 +2617,7 @@ static struct genl_family ip_vs_genl_family = { .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, .maxattr = IPVS_CMD_MAX, + .netnsok = true, /* Make ipvsadm to work on netns */ }; /* Policy used for first-level command attributes */ @@ -2696,11 +2772,12 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, int idx = 0, i; int start = cb->args[0]; struct ip_vs_service *svc; + struct net *net = skb_sknet(skb); mutex_lock(&__ip_vs_mutex); for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { - if (++idx <= start) + if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2711,7 +2788,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { - if (++idx <= start) + if (++idx <= start || !net_eq(svc->net, net)) continue; if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { idx--; @@ -2727,7 +2804,8 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, +static int ip_vs_genl_parse_service(struct net *net, + struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry, struct ip_vs_service **ret_svc) { @@ -2770,9 +2848,9 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, } if (usvc->fwmark) - svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark); + svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark); else - svc = __ip_vs_service_find(usvc->af, usvc->protocol, + svc = __ip_vs_service_find(net, usvc->af, usvc->protocol, &usvc->addr, usvc->port); *ret_svc = svc; @@ -2809,13 +2887,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, return 0; } -static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) +static struct ip_vs_service *ip_vs_genl_find_service(struct net *net, + struct nlattr *nla) { struct ip_vs_service_user_kern usvc; struct ip_vs_service *svc; int ret; - ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc); + ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc); return ret ? ERR_PTR(ret) : svc; } @@ -2883,6 +2962,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, struct ip_vs_service *svc; struct ip_vs_dest *dest; struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + struct net *net = skb_sknet(skb); mutex_lock(&__ip_vs_mutex); @@ -2891,7 +2971,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) goto out_err; - svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); + + svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc) || svc == NULL) goto out_err; @@ -3005,20 +3086,23 @@ nla_put_failure: static int ip_vs_genl_dump_daemons(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = skb_net(skb); + struct netns_ipvs *ipvs = net_ipvs(net); + mutex_lock(&__ip_vs_mutex); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ip_vs_master_mcast_ifn, - ip_vs_master_syncid, cb) < 0) + ipvs->master_mcast_ifn, + ipvs->master_syncid, cb) < 0) goto nla_put_failure; cb->args[0] = 1; } - if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ip_vs_backup_mcast_ifn, - ip_vs_backup_syncid, cb) < 0) + ipvs->backup_mcast_ifn, + ipvs->backup_syncid, cb) < 0) goto nla_put_failure; cb->args[1] = 1; @@ -3030,31 +3114,33 @@ nla_put_failure: return skb->len; } -static int ip_vs_genl_new_daemon(struct nlattr **attrs) +static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) { if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; - return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + return start_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); } -static int ip_vs_genl_del_daemon(struct nlattr **attrs) +static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) { if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; - return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + return stop_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); } -static int ip_vs_genl_set_config(struct nlattr **attrs) +static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(&t); + __ip_vs_get_timeouts(net, &t); if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); @@ -3066,7 +3152,7 @@ static int ip_vs_genl_set_config(struct nlattr **attrs) if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); - return ip_vs_set_timeout(&t); + return ip_vs_set_timeout(net, &t); } static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) @@ -3076,16 +3162,20 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) struct ip_vs_dest_user_kern udest; int ret = 0, cmd; int need_full_svc = 0, need_full_dest = 0; + struct net *net; + struct netns_ipvs *ipvs; + net = skb_sknet(skb); + ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; mutex_lock(&__ip_vs_mutex); if (cmd == IPVS_CMD_FLUSH) { - ret = ip_vs_flush(); + ret = ip_vs_flush(net); goto out; } else if (cmd == IPVS_CMD_SET_CONFIG) { - ret = ip_vs_genl_set_config(info->attrs); + ret = ip_vs_genl_set_config(net, info->attrs); goto out; } else if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { @@ -3101,13 +3191,13 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) } if (cmd == IPVS_CMD_NEW_DAEMON) - ret = ip_vs_genl_new_daemon(daemon_attrs); + ret = ip_vs_genl_new_daemon(net, daemon_attrs); else - ret = ip_vs_genl_del_daemon(daemon_attrs); + ret = ip_vs_genl_del_daemon(net, daemon_attrs); goto out; } else if (cmd == IPVS_CMD_ZERO && !info->attrs[IPVS_CMD_ATTR_SERVICE]) { - ret = ip_vs_zero_all(); + ret = ip_vs_zero_all(net); goto out; } @@ -3117,7 +3207,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) need_full_svc = 1; - ret = ip_vs_genl_parse_service(&usvc, + ret = ip_vs_genl_parse_service(net, &usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], need_full_svc, &svc); if (ret) @@ -3147,7 +3237,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) switch (cmd) { case IPVS_CMD_NEW_SERVICE: if (svc == NULL) - ret = ip_vs_add_service(&usvc, &svc); + ret = ip_vs_add_service(net, &usvc, &svc); else ret = -EEXIST; break; @@ -3185,7 +3275,11 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg; void *reply; int ret, cmd, reply_cmd; + struct net *net; + struct netns_ipvs *ipvs; + net = skb_sknet(skb); + ipvs = net_ipvs(net); cmd = info->genlhdr->cmd; if (cmd == IPVS_CMD_GET_SERVICE) @@ -3214,7 +3308,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_service *svc; - svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); + svc = ip_vs_genl_find_service(net, + info->attrs[IPVS_CMD_ATTR_SERVICE]); if (IS_ERR(svc)) { ret = PTR_ERR(svc); goto out_err; @@ -3234,7 +3329,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) { struct ip_vs_timeout_user t; - __ip_vs_get_timeouts(&t); + __ip_vs_get_timeouts(net, &t); #ifdef CONFIG_IP_VS_PROTO_TCP NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, @@ -3380,62 +3475,172 @@ static void ip_vs_genl_unregister(void) /* End of Generic Netlink interface definitions */ +/* + * per netns intit/exit func. + */ +int __net_init __ip_vs_control_init(struct net *net) +{ + int idx; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ctl_table *tbl; + + atomic_set(&ipvs->dropentry, 0); + spin_lock_init(&ipvs->dropentry_lock); + spin_lock_init(&ipvs->droppacket_lock); + spin_lock_init(&ipvs->securetcp_lock); + ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock); + + /* Initialize rs_table */ + for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) + INIT_LIST_HEAD(&ipvs->rs_table[idx]); + + INIT_LIST_HEAD(&ipvs->dest_trash); + atomic_set(&ipvs->ftpsvc_counter, 0); + atomic_set(&ipvs->nullsvc_counter, 0); + + /* procfs stats */ + ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL); + if (ipvs->tot_stats == NULL) { + pr_err("%s(): no memory.\n", __func__); + return -ENOMEM; + } + ipvs->cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!ipvs->cpustats) { + pr_err("%s() alloc_percpu failed\n", __func__); + goto err_alloc; + } + spin_lock_init(&ipvs->tot_stats->lock); + + for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) + INIT_LIST_HEAD(&ipvs->rs_table[idx]); + + proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops); + proc_net_fops_create(net, "ip_vs_stats_percpu", 0, + &ip_vs_stats_percpu_fops); + + if (!net_eq(net, &init_net)) { + tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); + if (tbl == NULL) + goto err_dup; + } else + tbl = vs_vars; + /* Initialize sysctl defaults */ + idx = 0; + ipvs->sysctl_amemthresh = 1024; + tbl[idx++].data = &ipvs->sysctl_amemthresh; + ipvs->sysctl_am_droprate = 10; + tbl[idx++].data = &ipvs->sysctl_am_droprate; + tbl[idx++].data = &ipvs->sysctl_drop_entry; + tbl[idx++].data = &ipvs->sysctl_drop_packet; +#ifdef CONFIG_IP_VS_NFCT + tbl[idx++].data = &ipvs->sysctl_conntrack; +#endif + tbl[idx++].data = &ipvs->sysctl_secure_tcp; + ipvs->sysctl_snat_reroute = 1; + tbl[idx++].data = &ipvs->sysctl_snat_reroute; + ipvs->sysctl_sync_ver = 1; + tbl[idx++].data = &ipvs->sysctl_sync_ver; + tbl[idx++].data = &ipvs->sysctl_cache_bypass; + tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn; + tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template; + ipvs->sysctl_sync_threshold[0] = 3; + ipvs->sysctl_sync_threshold[1] = 50; + tbl[idx].data = &ipvs->sysctl_sync_threshold; + tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold); + tbl[idx++].data = &ipvs->sysctl_nat_icmp_send; + + + ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path, + vs_vars); + if (ipvs->sysctl_hdr == NULL) + goto err_reg; + ip_vs_new_estimator(net, ipvs->tot_stats); + ipvs->sysctl_tbl = tbl; + /* Schedule defense work */ + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD); + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(tbl); +err_dup: + free_percpu(ipvs->cpustats); +err_alloc: + kfree(ipvs->tot_stats); + return -ENOMEM; +} + +static void __net_exit __ip_vs_control_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_trash_cleanup(net); + ip_vs_kill_estimator(net, ipvs->tot_stats); + cancel_delayed_work_sync(&ipvs->defense_work); + cancel_work_sync(&ipvs->defense_work.work); + unregister_net_sysctl_table(ipvs->sysctl_hdr); + proc_net_remove(net, "ip_vs_stats_percpu"); + proc_net_remove(net, "ip_vs_stats"); + proc_net_remove(net, "ip_vs"); + free_percpu(ipvs->cpustats); + kfree(ipvs->tot_stats); +} + +static struct pernet_operations ipvs_control_ops = { + .init = __ip_vs_control_init, + .exit = __ip_vs_control_cleanup, +}; int __init ip_vs_control_init(void) { - int ret; int idx; + int ret; EnterFunction(2); - /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ + /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { INIT_LIST_HEAD(&ip_vs_svc_table[idx]); INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); } - for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_rtable[idx]); + + ret = register_pernet_subsys(&ipvs_control_ops); + if (ret) { + pr_err("cannot register namespace.\n"); + goto err; } - smp_wmb(); + + smp_wmb(); /* Do we really need it now ? */ ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); - return ret; + goto err_net; } ret = ip_vs_genl_register(); if (ret) { pr_err("cannot register Generic Netlink interface.\n"); nf_unregister_sockopt(&ip_vs_sockopts); - return ret; + goto err_net; } - proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); - - ip_vs_new_estimator(&ip_vs_stats); - - /* Hook the defense timer */ - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); - LeaveFunction(2); return 0; + +err_net: + unregister_pernet_subsys(&ipvs_control_ops); +err: + return ret; } void ip_vs_control_cleanup(void) { EnterFunction(2); - ip_vs_trash_cleanup(); - cancel_delayed_work_sync(&defense_work); - cancel_work_sync(&defense_work.work); - ip_vs_kill_estimator(&ip_vs_stats); - unregister_sysctl_table(sysctl_header); - proc_net_remove(&init_net, "ip_vs_stats"); - proc_net_remove(&init_net, "ip_vs"); + unregister_pernet_subsys(&ipvs_control_ops); ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index ff28801962e..f560a05c965 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -8,8 +8,12 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Changes: - * + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * Affected data: est_list and est_lock. + * estimation_timer() runs with timer per netns. + * get_stats()) do the per cpu summing. */ #define KMSG_COMPONENT "IPVS" @@ -48,11 +52,42 @@ */ -static void estimation_timer(unsigned long arg); +/* + * Make a summary from each cpu + */ +static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum, + struct ip_vs_cpu_stats *stats) +{ + int i; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); + unsigned int start; + __u64 inbytes, outbytes; + if (i) { + sum->conns += s->ustats.conns; + sum->inpkts += s->ustats.inpkts; + sum->outpkts += s->ustats.outpkts; + do { + start = u64_stats_fetch_begin_bh(&s->syncp); + inbytes = s->ustats.inbytes; + outbytes = s->ustats.outbytes; + } while (u64_stats_fetch_retry_bh(&s->syncp, start)); + sum->inbytes += inbytes; + sum->outbytes += outbytes; + } else { + sum->conns = s->ustats.conns; + sum->inpkts = s->ustats.inpkts; + sum->outpkts = s->ustats.outpkts; + do { + start = u64_stats_fetch_begin_bh(&s->syncp); + sum->inbytes = s->ustats.inbytes; + sum->outbytes = s->ustats.outbytes; + } while (u64_stats_fetch_retry_bh(&s->syncp, start)); + } + } +} -static LIST_HEAD(est_list); -static DEFINE_SPINLOCK(est_lock); -static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); static void estimation_timer(unsigned long arg) { @@ -62,11 +97,16 @@ static void estimation_timer(unsigned long arg) u32 n_inpkts, n_outpkts; u64 n_inbytes, n_outbytes; u32 rate; + struct net *net = (struct net *)arg; + struct netns_ipvs *ipvs; - spin_lock(&est_lock); - list_for_each_entry(e, &est_list, list) { + ipvs = net_ipvs(net); + ip_vs_read_cpu_stats(&ipvs->tot_stats->ustats, ipvs->cpustats); + spin_lock(&ipvs->est_lock); + list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); + ip_vs_read_cpu_stats(&s->ustats, s->cpustats); spin_lock(&s->lock); n_conns = s->ustats.conns; n_inpkts = s->ustats.inpkts; @@ -75,38 +115,39 @@ static void estimation_timer(unsigned long arg) n_outbytes = s->ustats.outbytes; /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns)<<9; + rate = (n_conns - e->last_conns) << 9; e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps)>>2; - s->ustats.cps = (e->cps+0x1FF)>>10; + e->cps += ((long)rate - (long)e->cps) >> 2; + s->ustats.cps = (e->cps + 0x1FF) >> 10; - rate = (n_inpkts - e->last_inpkts)<<9; + rate = (n_inpkts - e->last_inpkts) << 9; e->last_inpkts = n_inpkts; - e->inpps += ((long)rate - (long)e->inpps)>>2; - s->ustats.inpps = (e->inpps+0x1FF)>>10; + e->inpps += ((long)rate - (long)e->inpps) >> 2; + s->ustats.inpps = (e->inpps + 0x1FF) >> 10; - rate = (n_outpkts - e->last_outpkts)<<9; + rate = (n_outpkts - e->last_outpkts) << 9; e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps)>>2; - s->ustats.outpps = (e->outpps+0x1FF)>>10; + e->outpps += ((long)rate - (long)e->outpps) >> 2; + s->ustats.outpps = (e->outpps + 0x1FF) >> 10; - rate = (n_inbytes - e->last_inbytes)<<4; + rate = (n_inbytes - e->last_inbytes) << 4; e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps)>>2; - s->ustats.inbps = (e->inbps+0xF)>>5; + e->inbps += ((long)rate - (long)e->inbps) >> 2; + s->ustats.inbps = (e->inbps + 0xF) >> 5; - rate = (n_outbytes - e->last_outbytes)<<4; + rate = (n_outbytes - e->last_outbytes) << 4; e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps)>>2; - s->ustats.outbps = (e->outbps+0xF)>>5; + e->outbps += ((long)rate - (long)e->outbps) >> 2; + s->ustats.outbps = (e->outbps + 0xF) >> 5; spin_unlock(&s->lock); } - spin_unlock(&est_lock); - mod_timer(&est_timer, jiffies + 2*HZ); + spin_unlock(&ipvs->est_lock); + mod_timer(&ipvs->est_timer, jiffies + 2*HZ); } -void ip_vs_new_estimator(struct ip_vs_stats *stats) +void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; INIT_LIST_HEAD(&est->list); @@ -126,18 +167,19 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats) est->last_outbytes = stats->ustats.outbytes; est->outbps = stats->ustats.outbps<<5; - spin_lock_bh(&est_lock); - list_add(&est->list, &est_list); - spin_unlock_bh(&est_lock); + spin_lock_bh(&ipvs->est_lock); + list_add(&est->list, &ipvs->est_list); + spin_unlock_bh(&ipvs->est_lock); } -void ip_vs_kill_estimator(struct ip_vs_stats *stats) +void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats) { + struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_estimator *est = &stats->est; - spin_lock_bh(&est_lock); + spin_lock_bh(&ipvs->est_lock); list_del(&est->list); - spin_unlock_bh(&est_lock); + spin_unlock_bh(&ipvs->est_lock); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) @@ -157,13 +199,35 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats) est->outbps = 0; } -int __init ip_vs_estimator_init(void) +static int __net_init __ip_vs_estimator_init(struct net *net) { - mod_timer(&est_timer, jiffies + 2 * HZ); + struct netns_ipvs *ipvs = net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->est_list); + spin_lock_init(&ipvs->est_lock); + setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net); + mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); return 0; } +static void __net_exit __ip_vs_estimator_exit(struct net *net) +{ + del_timer_sync(&net_ipvs(net)->est_timer); +} +static struct pernet_operations ip_vs_app_ops = { + .init = __ip_vs_estimator_init, + .exit = __ip_vs_estimator_exit, +}; + +int __init ip_vs_estimator_init(void) +{ + int rv; + + rv = register_pernet_subsys(&ip_vs_app_ops); + return rv; +} + void ip_vs_estimator_cleanup(void) { - del_timer_sync(&est_timer); + unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 75455000ad1..6b5dd6ddaae 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -157,6 +157,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, int ret = 0; enum ip_conntrack_info ctinfo; struct nf_conn *ct; + struct net *net; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -197,18 +198,20 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, */ { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(AF_INET, iph->protocol, - &from, port, &cp->caddr, 0, &p); + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &from, port, + &cp->caddr, 0, &p); n_cp = ip_vs_conn_out_get(&p); } if (!n_cp) { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr, + ip_vs_conn_fill_param(ip_vs_conn_net(cp), + AF_INET, IPPROTO_TCP, &cp->caddr, 0, &cp->vaddr, port, &p); n_cp = ip_vs_conn_new(&p, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, - cp->dest); + cp->dest, skb->mark); if (!n_cp) return 0; @@ -257,8 +260,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * would be adjusted twice. */ + net = skb_net(skb); cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); + ip_vs_tcp_conn_listen(net, n_cp); ip_vs_conn_put(n_cp); return ret; } @@ -287,6 +291,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; + struct net *net; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -358,14 +363,15 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1), - &p); + ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET, + iph->protocol, &to, port, &cp->vaddr, + htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { n_cp = ip_vs_conn_new(&p, &cp->daddr, htons(ntohs(cp->dport)-1), - IP_VS_CONN_F_NFCT, cp->dest); + IP_VS_CONN_F_NFCT, cp->dest, + skb->mark); if (!n_cp) return 0; @@ -377,7 +383,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Move tunnel to listen state */ - ip_vs_tcp_conn_listen(n_cp); + net = skb_net(skb); + ip_vs_tcp_conn_listen(net, n_cp); ip_vs_conn_put(n_cp); return 1; @@ -398,23 +405,22 @@ static struct ip_vs_app ip_vs_ftp = { .pkt_in = ip_vs_ftp_in, }; - /* - * ip_vs_ftp initialization + * per netns ip_vs_ftp initialization */ -static int __init ip_vs_ftp_init(void) +static int __net_init __ip_vs_ftp_init(struct net *net) { int i, ret; struct ip_vs_app *app = &ip_vs_ftp; - ret = register_ip_vs_app(app); + ret = register_ip_vs_app(net, app); if (ret) return ret; for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { if (!ports[i]) continue; - ret = register_ip_vs_app_inc(app, app->protocol, ports[i]); + ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]); if (ret) break; pr_info("%s: loaded support on port[%d] = %d\n", @@ -422,18 +428,39 @@ static int __init ip_vs_ftp_init(void) } if (ret) - unregister_ip_vs_app(app); + unregister_ip_vs_app(net, app); return ret; } +/* + * netns exit + */ +static void __ip_vs_ftp_exit(struct net *net) +{ + struct ip_vs_app *app = &ip_vs_ftp; + + unregister_ip_vs_app(net, app); +} + +static struct pernet_operations ip_vs_ftp_ops = { + .init = __ip_vs_ftp_init, + .exit = __ip_vs_ftp_exit, +}; +int __init ip_vs_ftp_init(void) +{ + int rv; + + rv = register_pernet_subsys(&ip_vs_ftp_ops); + return rv; +} /* * ip_vs_ftp finish. */ static void __exit ip_vs_ftp_exit(void) { - unregister_ip_vs_app(&ip_vs_ftp); + unregister_pernet_subsys(&ip_vs_ftp_ops); } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 9323f894419..d5bec337187 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -70,7 +70,6 @@ * entries that haven't been touched for a day. */ #define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; /* @@ -117,7 +116,7 @@ struct ip_vs_lblc_table { static ctl_table vs_vars_table[] = { { .procname = "lblc_expiration", - .data = &sysctl_ip_vs_lblc_expiration, + .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -125,8 +124,6 @@ static ctl_table vs_vars_table[] = { { } }; -static struct ctl_table_header * sysctl_header; - static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) { list_del(&en->list); @@ -248,6 +245,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) struct ip_vs_lblc_entry *en, *nxt; unsigned long now = jiffies; int i, j; + struct netns_ipvs *ipvs = net_ipvs(svc->net); for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLC_TAB_MASK; @@ -255,7 +253,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) write_lock(&svc->sched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { if (time_before(now, - en->lastuse + sysctl_ip_vs_lblc_expiration)) + en->lastuse + + ipvs->sysctl_lblc_expiration)) continue; ip_vs_lblc_free(en); @@ -543,23 +542,73 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler = .schedule = ip_vs_lblc_schedule, }; +/* + * per netns init. + */ +static int __net_init __ip_vs_lblc_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!net_eq(net, &init_net)) { + ipvs->lblc_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblc_ctl_table == NULL) + goto err_dup; + } else + ipvs->lblc_ctl_table = vs_vars_table; + ipvs->sysctl_lblc_expiration = 24*60*60*HZ; + ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration; + + ipvs->lblc_ctl_header = + register_net_sysctl_table(net, net_vs_ctl_path, + ipvs->lblc_ctl_table); + if (!ipvs->lblc_ctl_header) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); + +err_dup: + return -ENOMEM; +} + +static void __net_exit __ip_vs_lblc_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblc_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblc_ctl_table); +} + +static struct pernet_operations ip_vs_lblc_ops = { + .init = __ip_vs_lblc_init, + .exit = __ip_vs_lblc_exit, +}; static int __init ip_vs_lblc_init(void) { int ret; - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_pernet_subsys(&ip_vs_lblc_ops); + if (ret) + return ret; + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); if (ret) - unregister_sysctl_table(sysctl_header); + unregister_pernet_subsys(&ip_vs_lblc_ops); return ret; } - static void __exit ip_vs_lblc_cleanup(void) { - unregister_sysctl_table(sysctl_header); unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); + unregister_pernet_subsys(&ip_vs_lblc_ops); } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index dbeed8ea421..61ae8cfcf0b 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -70,8 +70,6 @@ * entries that haven't been touched for a day. */ #define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; - /* * for IPVS lblcr entry hash table @@ -296,7 +294,7 @@ struct ip_vs_lblcr_table { static ctl_table vs_vars_table[] = { { .procname = "lblcr_expiration", - .data = &sysctl_ip_vs_lblcr_expiration, + .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, @@ -304,8 +302,6 @@ static ctl_table vs_vars_table[] = { { } }; -static struct ctl_table_header * sysctl_header; - static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) { list_del(&en->list); @@ -425,14 +421,15 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) unsigned long now = jiffies; int i, j; struct ip_vs_lblcr_entry *en, *nxt; + struct netns_ipvs *ipvs = net_ipvs(svc->net); for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { j = (j + 1) & IP_VS_LBLCR_TAB_MASK; write_lock(&svc->sched_lock); list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, - now)) + if (time_after(en->lastuse + + ipvs->sysctl_lblcr_expiration, now)) continue; ip_vs_lblcr_free(en); @@ -664,6 +661,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) read_lock(&svc->sched_lock); en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); if (en) { + struct netns_ipvs *ipvs = net_ipvs(svc->net); /* We only hold a read lock, but this is atomic */ en->lastuse = jiffies; @@ -675,7 +673,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* More than one destination + enough time passed by, cleanup */ if (atomic_read(&en->set.size) > 1 && time_after(jiffies, en->set.lastmod + - sysctl_ip_vs_lblcr_expiration)) { + ipvs->sysctl_lblcr_expiration)) { struct ip_vs_dest *m; write_lock(&en->set.lock); @@ -744,23 +742,73 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler = .schedule = ip_vs_lblcr_schedule, }; +/* + * per netns init. + */ +static int __net_init __ip_vs_lblcr_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!net_eq(net, &init_net)) { + ipvs->lblcr_ctl_table = kmemdup(vs_vars_table, + sizeof(vs_vars_table), + GFP_KERNEL); + if (ipvs->lblcr_ctl_table == NULL) + goto err_dup; + } else + ipvs->lblcr_ctl_table = vs_vars_table; + ipvs->sysctl_lblcr_expiration = 24*60*60*HZ; + ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration; + + ipvs->lblcr_ctl_header = + register_net_sysctl_table(net, net_vs_ctl_path, + ipvs->lblcr_ctl_table); + if (!ipvs->lblcr_ctl_header) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); + +err_dup: + return -ENOMEM; +} + +static void __net_exit __ip_vs_lblcr_exit(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + unregister_net_sysctl_table(ipvs->lblcr_ctl_header); + + if (!net_eq(net, &init_net)) + kfree(ipvs->lblcr_ctl_table); +} + +static struct pernet_operations ip_vs_lblcr_ops = { + .init = __ip_vs_lblcr_init, + .exit = __ip_vs_lblcr_exit, +}; static int __init ip_vs_lblcr_init(void) { int ret; - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_pernet_subsys(&ip_vs_lblcr_ops); + if (ret) + return ret; + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); if (ret) - unregister_sysctl_table(sysctl_header); + unregister_pernet_subsys(&ip_vs_lblcr_ops); return ret; } - static void __exit ip_vs_lblcr_cleanup(void) { - unregister_sysctl_table(sysctl_header); unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + unregister_pernet_subsys(&ip_vs_lblcr_ops); } diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 4680647cd45..f454c80df0a 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -141,6 +141,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, struct nf_conntrack_tuple *orig, new_reply; struct ip_vs_conn *cp; struct ip_vs_conn_param p; + struct net *net = nf_ct_net(ct); if (exp->tuple.src.l3num != PF_INET) return; @@ -155,7 +156,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, /* RS->CLIENT */ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum, + ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum, &orig->src.u3, orig->src.u.tcp.port, &orig->dst.u3, orig->dst.u.tcp.port, &p); cp = ip_vs_conn_out_get(&p); @@ -268,7 +269,8 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) " for conn " FMT_CONN "\n", __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); - h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple); + h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, + &tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); /* Show what happens instead of calling nf_ct_kill() */ diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 3414af70ee1..5cf859ccb31 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -29,12 +29,11 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc) } /* Get pe in the pe list by name */ -static struct ip_vs_pe * -ip_vs_pe_getbyname(const char *pe_name) +struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) { struct ip_vs_pe *pe; - IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__, + IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, pe_name); spin_lock_bh(&ip_vs_pe_lock); @@ -60,28 +59,22 @@ ip_vs_pe_getbyname(const char *pe_name) } /* Lookup pe and try to load it if it doesn't exist */ -struct ip_vs_pe *ip_vs_pe_get(const char *name) +struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) { struct ip_vs_pe *pe; /* Search for the pe by name */ - pe = ip_vs_pe_getbyname(name); + pe = __ip_vs_pe_getbyname(name); /* If pe not found, load the module and search again */ if (!pe) { request_module("ip_vs_pe_%s", name); - pe = ip_vs_pe_getbyname(name); + pe = __ip_vs_pe_getbyname(name); } return pe; } -void ip_vs_pe_put(struct ip_vs_pe *pe) -{ - if (pe && pe->module) - module_put(pe->module); -} - /* Register a pe in the pe list */ int register_ip_vs_pe(struct ip_vs_pe *pe) { diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index b8b4e9620f3..0d83bc01fed 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -71,6 +71,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) struct ip_vs_iphdr iph; unsigned int dataoff, datalen, matchoff, matchlen; const char *dptr; + int retc; ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph); @@ -83,6 +84,8 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) if (dataoff >= skb->len) return -EINVAL; + if ((retc=skb_linearize(skb)) < 0) + return retc; dptr = skb->data + dataoff; datalen = skb->len - dataoff; diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index c5399839087..6ac986cdcff 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -60,6 +60,31 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) return 0; } +/* + * register an ipvs protocols netns related data + */ +static int +register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + struct ip_vs_proto_data *pd = + kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC); + + if (!pd) { + pr_err("%s(): no memory.\n", __func__); + return -ENOMEM; + } + pd->pp = pp; /* For speed issues */ + pd->next = ipvs->proto_data_table[hash]; + ipvs->proto_data_table[hash] = pd; + atomic_set(&pd->appcnt, 0); /* Init app counter */ + + if (pp->init_netns != NULL) + pp->init_netns(net, pd); + + return 0; +} /* * unregister an ipvs protocol @@ -82,6 +107,29 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) return -ESRCH; } +/* + * unregister an ipvs protocols netns data + */ +static int +unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data **pd_p; + unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol); + + pd_p = &ipvs->proto_data_table[hash]; + for (; *pd_p; pd_p = &(*pd_p)->next) { + if (*pd_p == pd) { + *pd_p = pd->next; + if (pd->pp->exit_netns != NULL) + pd->pp->exit_netns(net, pd); + kfree(pd); + return 0; + } + } + + return -ESRCH; +} /* * get ip_vs_protocol object by its proto. @@ -100,19 +148,44 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) } EXPORT_SYMBOL(ip_vs_proto_get); +/* + * get ip_vs_protocol object data by netns and proto + */ +struct ip_vs_proto_data * +__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto) +{ + struct ip_vs_proto_data *pd; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) { + if (pd->pp->protocol == proto) + return pd; + } + + return NULL; +} + +struct ip_vs_proto_data * +ip_vs_proto_data_get(struct net *net, unsigned short proto) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + return __ipvs_proto_data_get(ipvs, proto); +} +EXPORT_SYMBOL(ip_vs_proto_data_get); /* * Propagate event for state change to all protocols */ -void ip_vs_protocol_timeout_change(int flags) +void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags) { - struct ip_vs_protocol *pp; + struct ip_vs_proto_data *pd; int i; for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { - if (pp->timeout_change) - pp->timeout_change(pp, flags); + for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) { + if (pd->pp->timeout_change) + pd->pp->timeout_change(pd, flags); } } } @@ -236,6 +309,46 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); } +/* + * per network name-space init + */ +static int __net_init __ip_vs_protocol_init(struct net *net) +{ +#ifdef CONFIG_IP_VS_PROTO_TCP + register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + register_ip_vs_proto_netns(net, &ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_SCTP + register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + register_ip_vs_proto_netns(net, &ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + register_ip_vs_proto_netns(net, &ip_vs_protocol_esp); +#endif + return 0; +} + +static void __net_exit __ip_vs_protocol_cleanup(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd; + int i; + + /* unregister all the ipvs proto data for this netns */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pd = ipvs->proto_data_table[i]) != NULL) + unregister_ip_vs_proto_netns(net, pd); + } +} + +static struct pernet_operations ipvs_proto_ops = { + .init = __ip_vs_protocol_init, + .exit = __ip_vs_protocol_cleanup, +}; int __init ip_vs_protocol_init(void) { @@ -265,6 +378,7 @@ int __init ip_vs_protocol_init(void) REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif pr_info("Registered protocols (%s)\n", &protocols[2]); + return register_pernet_subsys(&ipvs_proto_ops); return 0; } @@ -275,6 +389,7 @@ void ip_vs_protocol_cleanup(void) struct ip_vs_protocol *pp; int i; + unregister_pernet_subsys(&ipvs_proto_ops); /* unregister all the ipvs protocols */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pp = ip_vs_proto_table[i]) != NULL) diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 3a0461117d3..5b8eb8b12c3 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -41,28 +41,30 @@ struct isakmp_hdr { #define PORT_ISAKMP 500 static void -ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph, - int inverse, struct ip_vs_conn_param *p) +ah_esp_conn_fill_param_proto(struct net *net, int af, + const struct ip_vs_iphdr *iph, int inverse, + struct ip_vs_conn_param *p) { if (likely(!inverse)) - ip_vs_conn_fill_param(af, IPPROTO_UDP, + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, &iph->saddr, htons(PORT_ISAKMP), &iph->daddr, htons(PORT_ISAKMP), p); else - ip_vs_conn_fill_param(af, IPPROTO_UDP, + ip_vs_conn_fill_param(net, af, IPPROTO_UDP, &iph->daddr, htons(PORT_ISAKMP), &iph->saddr, htons(PORT_ISAKMP), p); } static struct ip_vs_conn * -ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, +ah_esp_conn_in_get(int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; + struct net *net = skb_net(skb); - ah_esp_conn_fill_param_proto(af, iph, inverse, &p); + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); cp = ip_vs_conn_in_get(&p); if (!cp) { /* @@ -72,7 +74,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " "%s%s %s->%s\n", inverse ? "ICMP+" : "", - pp->name, + ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } @@ -83,21 +85,21 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, static struct ip_vs_conn * ah_esp_conn_out_get(int af, const struct sk_buff *skb, - struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { struct ip_vs_conn *cp; struct ip_vs_conn_param p; + struct net *net = skb_net(skb); - ah_esp_conn_fill_param_proto(af, iph, inverse, &p); + ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p); cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", inverse ? "ICMP+" : "", - pp->name, + ip_vs_proto_get(iph->protocol)->name, IP_VS_DBG_ADDR(af, &iph->saddr), IP_VS_DBG_ADDR(af, &iph->daddr)); } @@ -107,7 +109,7 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, static int -ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp) { /* @@ -117,26 +119,14 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } -static void ah_esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void ah_esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - #ifdef CONFIG_IP_VS_PROTO_AH struct ip_vs_protocol ip_vs_protocol_ah = { .name = "AH", .protocol = IPPROTO_AH, .num_states = 1, .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, + .init = NULL, + .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, @@ -149,7 +139,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = { .app_conn_bind = NULL, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ - .set_state_timeout = NULL, }; #endif @@ -159,8 +148,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = { .protocol = IPPROTO_ESP, .num_states = 1, .dont_defrag = 1, - .init = ah_esp_init, - .exit = ah_esp_exit, + .init = NULL, + .exit = NULL, .conn_schedule = ah_esp_conn_schedule, .conn_in_get = ah_esp_conn_in_get, .conn_out_get = ah_esp_conn_out_get, diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 1ea96bcd342..fb2d04ac5d4 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -9,9 +9,10 @@ #include <net/ip_vs.h> static int -sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, +sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp) { + struct net *net; struct ip_vs_service *svc; sctp_chunkhdr_t _schunkh, *sch; sctp_sctphdr_t *sh, _sctph; @@ -27,13 +28,13 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, sizeof(_schunkh), &_schunkh); if (sch == NULL) return 0; - + net = skb_net(skb); if ((sch->type == SCTP_CID_INIT) && - (svc = ip_vs_service_get(af, skb->mark, iph.protocol, + (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, &iph.daddr, sh->dest))) { int ignored; - if (ip_vs_todrop()) { + if (ip_vs_todrop(net_ipvs(net))) { /* * It seems that we are very loaded. * We have to drop this packet :( @@ -46,14 +47,19 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb, pp, &ignored); - if (!*cpp && !ignored) { - *verdict = ip_vs_leave(svc, skb, pp); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } return 0; } ip_vs_service_put(svc); } - + /* NF_ACCEPT */ return 1; } @@ -856,7 +862,7 @@ static struct ipvs_sctp_nextstate /* * Timeout table[state] */ -static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { +static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { [IP_VS_SCTP_S_NONE] = 2 * HZ, [IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ, [IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ, @@ -900,20 +906,8 @@ static const char *sctp_state_name(int state) return "?"; } -static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags) -{ -} - -static int -sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - -return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST, - sctp_state_name_table, sname, to); -} - static inline int -set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, +set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, const struct sk_buff *skb) { sctp_chunkhdr_t _sctpch, *sch; @@ -971,7 +965,7 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, IP_VS_DBG_BUF(8, "%s %s %s:%d->" "%s:%d state: %s->%s conn->refcnt:%d\n", - pp->name, + pd->pp->name, ((direction == IP_VS_DIR_OUTPUT) ? "output " : "input "), IP_VS_DBG_ADDR(cp->af, &cp->daddr), @@ -995,75 +989,73 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, } } } + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = next_state]; + else /* What to do ? */ + cp->timeout = sctp_timeouts[cp->state = next_state]; - cp->timeout = pp->timeout_table[cp->state = next_state]; - - return 1; + return 1; } static int sctp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, struct ip_vs_protocol *pp) + const struct sk_buff *skb, struct ip_vs_proto_data *pd) { int ret = 0; spin_lock(&cp->lock); - ret = set_sctp_state(pp, cp, direction, skb); + ret = set_sctp_state(pd, cp, direction, skb); spin_unlock(&cp->lock); return ret; } -/* - * Hash table for SCTP application incarnations - */ -#define SCTP_APP_TAB_BITS 4 -#define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS) -#define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1) - -static struct list_head sctp_apps[SCTP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(sctp_app_lock); - static inline __u16 sctp_app_hashkey(__be16 port) { return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) & SCTP_APP_TAB_MASK; } -static int sctp_register_app(struct ip_vs_app *inc) +static int sctp_register_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); hash = sctp_app_hashkey(port); - spin_lock_bh(&sctp_app_lock); - list_for_each_entry(i, &sctp_apps[hash], p_list) { + spin_lock_bh(&ipvs->sctp_app_lock); + list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &sctp_apps[hash]); - atomic_inc(&ip_vs_protocol_sctp.appcnt); + list_add(&inc->p_list, &ipvs->sctp_apps[hash]); + atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&sctp_app_lock); + spin_unlock_bh(&ipvs->sctp_app_lock); return ret; } -static void sctp_unregister_app(struct ip_vs_app *inc) +static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc) { - spin_lock_bh(&sctp_app_lock); - atomic_dec(&ip_vs_protocol_sctp.appcnt); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP); + + spin_lock_bh(&ipvs->sctp_app_lock); + atomic_dec(&pd->appcnt); list_del(&inc->p_list); - spin_unlock_bh(&sctp_app_lock); + spin_unlock_bh(&ipvs->sctp_app_lock); } static int sctp_app_conn_bind(struct ip_vs_conn *cp) { + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); int hash; struct ip_vs_app *inc; int result = 0; @@ -1074,12 +1066,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = sctp_app_hashkey(cp->vport); - spin_lock(&sctp_app_lock); - list_for_each_entry(inc, &sctp_apps[hash], p_list) { + spin_lock(&ipvs->sctp_app_lock); + list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&sctp_app_lock); + spin_unlock(&ipvs->sctp_app_lock); IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -1095,43 +1087,50 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&sctp_app_lock); + spin_unlock(&ipvs->sctp_app_lock); out: return result; } -static void ip_vs_sctp_init(struct ip_vs_protocol *pp) +/* --------------------------------------------- + * timeouts is netns related now. + * --------------------------------------------- + */ +static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd) { - IP_VS_INIT_HASH_TABLE(sctp_apps); - pp->timeout_table = sctp_timeouts; -} + struct netns_ipvs *ipvs = net_ipvs(net); + ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE); + spin_lock_init(&ipvs->tcp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts, + sizeof(sctp_timeouts)); +} -static void ip_vs_sctp_exit(struct ip_vs_protocol *pp) +static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd) { - + kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_sctp = { - .name = "SCTP", - .protocol = IPPROTO_SCTP, - .num_states = IP_VS_SCTP_S_LAST, - .dont_defrag = 0, - .appcnt = ATOMIC_INIT(0), - .init = ip_vs_sctp_init, - .exit = ip_vs_sctp_exit, - .register_app = sctp_register_app, + .name = "SCTP", + .protocol = IPPROTO_SCTP, + .num_states = IP_VS_SCTP_S_LAST, + .dont_defrag = 0, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_sctp_init, + .exit_netns = __ip_vs_sctp_exit, + .register_app = sctp_register_app, .unregister_app = sctp_unregister_app, - .conn_schedule = sctp_conn_schedule, - .conn_in_get = ip_vs_conn_in_get_proto, - .conn_out_get = ip_vs_conn_out_get_proto, - .snat_handler = sctp_snat_handler, - .dnat_handler = sctp_dnat_handler, - .csum_check = sctp_csum_check, - .state_name = sctp_state_name, + .conn_schedule = sctp_conn_schedule, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, + .snat_handler = sctp_snat_handler, + .dnat_handler = sctp_dnat_handler, + .csum_check = sctp_csum_check, + .state_name = sctp_state_name, .state_transition = sctp_state_transition, - .app_conn_bind = sctp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = sctp_timeout_change, - .set_state_timeout = sctp_set_state_timeout, + .app_conn_bind = sctp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, }; diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index f6c5200e214..c0cc341b840 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -9,8 +9,12 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Changes: + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * + * Network name space (netns) aware. + * Global data moved to netns i.e struct netns_ipvs + * tcp_timeouts table has copy per netns in a hash table per + * protocol ip_vs_proto_data and is handled by netns */ #define KMSG_COMPONENT "IPVS" @@ -28,9 +32,10 @@ #include <net/ip_vs.h> static int -tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp) { + struct net *net; struct ip_vs_service *svc; struct tcphdr _tcph, *th; struct ip_vs_iphdr iph; @@ -42,14 +47,14 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, *verdict = NF_DROP; return 0; } - + net = skb_net(skb); /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ if (th->syn && - (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, - th->dest))) { + (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, + &iph.daddr, th->dest))) { int ignored; - if (ip_vs_todrop()) { + if (ip_vs_todrop(net_ipvs(net))) { /* * It seems that we are very loaded. * We have to drop this packet :( @@ -63,13 +68,19 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb, pp, &ignored); - if (!*cpp && !ignored) { - *verdict = ip_vs_leave(svc, skb, pp); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } return 0; } ip_vs_service_put(svc); } + /* NF_ACCEPT */ return 1; } @@ -338,7 +349,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = { /* * Timeout table[state] */ -static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { +static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = 2*HZ, [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, @@ -437,10 +448,7 @@ static struct tcp_states_t tcp_states_dos [] = { /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; -static struct tcp_states_t *tcp_state_table = tcp_states; - - -static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) +static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) { int on = (flags & 1); /* secure_tcp */ @@ -450,14 +458,7 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) ** for most if not for all of the applications. Something ** like "capabilities" (flags) for each object. */ - tcp_state_table = (on? tcp_states_dos : tcp_states); -} - -static int -tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, - tcp_state_name_table, sname, to); + pd->tcp_state_table = (on ? tcp_states_dos : tcp_states); } static inline int tcp_state_idx(struct tcphdr *th) @@ -474,7 +475,7 @@ static inline int tcp_state_idx(struct tcphdr *th) } static inline void -set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, +set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, struct tcphdr *th) { int state_idx; @@ -497,7 +498,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, goto tcp_state_out; } - new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; + new_state = + pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; tcp_state_out: if (new_state != cp->state) { @@ -505,7 +507,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" "%s:%d state: %s->%s conn->refcnt:%d\n", - pp->name, + pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), th->syn ? 'S' : '.', @@ -535,17 +537,19 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, } } - cp->timeout = pp->timeout_table[cp->state = new_state]; + if (likely(pd)) + cp->timeout = pd->timeout_table[cp->state = new_state]; + else /* What to do ? */ + cp->timeout = tcp_timeouts[cp->state = new_state]; } - /* * Handle state transitions */ static int tcp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { struct tcphdr _tcph, *th; @@ -560,23 +564,12 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction, return 0; spin_lock(&cp->lock); - set_tcp_state(pp, cp, direction, th); + set_tcp_state(pd, cp, direction, th); spin_unlock(&cp->lock); return 1; } - -/* - * Hash table for TCP application incarnations - */ -#define TCP_APP_TAB_BITS 4 -#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) -#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) - -static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(tcp_app_lock); - static inline __u16 tcp_app_hashkey(__be16 port) { return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) @@ -584,44 +577,50 @@ static inline __u16 tcp_app_hashkey(__be16 port) } -static int tcp_register_app(struct ip_vs_app *inc) +static int tcp_register_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); hash = tcp_app_hashkey(port); - spin_lock_bh(&tcp_app_lock); - list_for_each_entry(i, &tcp_apps[hash], p_list) { + spin_lock_bh(&ipvs->tcp_app_lock); + list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &tcp_apps[hash]); - atomic_inc(&ip_vs_protocol_tcp.appcnt); + list_add(&inc->p_list, &ipvs->tcp_apps[hash]); + atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&tcp_app_lock); + spin_unlock_bh(&ipvs->tcp_app_lock); return ret; } static void -tcp_unregister_app(struct ip_vs_app *inc) +tcp_unregister_app(struct net *net, struct ip_vs_app *inc) { - spin_lock_bh(&tcp_app_lock); - atomic_dec(&ip_vs_protocol_tcp.appcnt); + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + + spin_lock_bh(&ipvs->tcp_app_lock); + atomic_dec(&pd->appcnt); list_del(&inc->p_list); - spin_unlock_bh(&tcp_app_lock); + spin_unlock_bh(&ipvs->tcp_app_lock); } static int tcp_app_conn_bind(struct ip_vs_conn *cp) { + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); int hash; struct ip_vs_app *inc; int result = 0; @@ -633,12 +632,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); - spin_lock(&tcp_app_lock); - list_for_each_entry(inc, &tcp_apps[hash], p_list) { + spin_lock(&ipvs->tcp_app_lock); + list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&tcp_app_lock); + spin_unlock(&ipvs->tcp_app_lock); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -655,7 +654,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&tcp_app_lock); + spin_unlock(&ipvs->tcp_app_lock); out: return result; @@ -665,24 +664,35 @@ tcp_app_conn_bind(struct ip_vs_conn *cp) /* * Set LISTEN timeout. (ip_vs_conn_put will setup timer) */ -void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp) { + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP); + spin_lock(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; - cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; + cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] + : tcp_timeouts[IP_VS_TCP_S_LISTEN]); spin_unlock(&cp->lock); } - -static void ip_vs_tcp_init(struct ip_vs_protocol *pp) +/* --------------------------------------------- + * timeouts is netns related now. + * --------------------------------------------- + */ +static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd) { - IP_VS_INIT_HASH_TABLE(tcp_apps); - pp->timeout_table = tcp_timeouts; -} + struct netns_ipvs *ipvs = net_ipvs(net); + ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); + spin_lock_init(&ipvs->tcp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, + sizeof(tcp_timeouts)); + pd->tcp_state_table = tcp_states; +} -static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) +static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd) { + kfree(pd->timeout_table); } @@ -691,9 +701,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .protocol = IPPROTO_TCP, .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, - .appcnt = ATOMIC_INIT(0), - .init = ip_vs_tcp_init, - .exit = ip_vs_tcp_exit, + .init = NULL, + .exit = NULL, + .init_netns = __ip_vs_tcp_init, + .exit_netns = __ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, @@ -707,5 +718,4 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .app_conn_bind = tcp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = tcp_timeout_change, - .set_state_timeout = tcp_set_state_timeout, }; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 9d106a06bb0..f1282cbe6fe 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -9,7 +9,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * - * Changes: + * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> + * Network name space (netns) aware. * */ @@ -28,9 +29,10 @@ #include <net/ip6_checksum.h> static int -udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp) { + struct net *net; struct ip_vs_service *svc; struct udphdr _udph, *uh; struct ip_vs_iphdr iph; @@ -42,13 +44,13 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, *verdict = NF_DROP; return 0; } - - svc = ip_vs_service_get(af, skb->mark, iph.protocol, + net = skb_net(skb); + svc = ip_vs_service_get(net, af, skb->mark, iph.protocol, &iph.daddr, uh->dest); if (svc) { int ignored; - if (ip_vs_todrop()) { + if (ip_vs_todrop(net_ipvs(net))) { /* * It seems that we are very loaded. * We have to drop this packet :( @@ -62,13 +64,19 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb, pp, &ignored); - if (!*cpp && !ignored) { - *verdict = ip_vs_leave(svc, skb, pp); + *cpp = ip_vs_schedule(svc, skb, pd, &ignored); + if (!*cpp && ignored <= 0) { + if (!ignored) + *verdict = ip_vs_leave(svc, skb, pd); + else { + ip_vs_service_put(svc); + *verdict = NF_DROP; + } return 0; } ip_vs_service_put(svc); } + /* NF_ACCEPT */ return 1; } @@ -338,19 +346,6 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) return 1; } - -/* - * Note: the caller guarantees that only one of register_app, - * unregister_app or app_conn_bind is called each time. - */ - -#define UDP_APP_TAB_BITS 4 -#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) -#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) - -static struct list_head udp_apps[UDP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(udp_app_lock); - static inline __u16 udp_app_hashkey(__be16 port) { return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) @@ -358,44 +353,50 @@ static inline __u16 udp_app_hashkey(__be16 port) } -static int udp_register_app(struct ip_vs_app *inc) +static int udp_register_app(struct net *net, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); hash = udp_app_hashkey(port); - spin_lock_bh(&udp_app_lock); - list_for_each_entry(i, &udp_apps[hash], p_list) { + spin_lock_bh(&ipvs->udp_app_lock); + list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } - list_add(&inc->p_list, &udp_apps[hash]); - atomic_inc(&ip_vs_protocol_udp.appcnt); + list_add(&inc->p_list, &ipvs->udp_apps[hash]); + atomic_inc(&pd->appcnt); out: - spin_unlock_bh(&udp_app_lock); + spin_unlock_bh(&ipvs->udp_app_lock); return ret; } static void -udp_unregister_app(struct ip_vs_app *inc) +udp_unregister_app(struct net *net, struct ip_vs_app *inc) { - spin_lock_bh(&udp_app_lock); - atomic_dec(&ip_vs_protocol_udp.appcnt); + struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP); + struct netns_ipvs *ipvs = net_ipvs(net); + + spin_lock_bh(&ipvs->udp_app_lock); + atomic_dec(&pd->appcnt); list_del(&inc->p_list); - spin_unlock_bh(&udp_app_lock); + spin_unlock_bh(&ipvs->udp_app_lock); } static int udp_app_conn_bind(struct ip_vs_conn *cp) { + struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp)); int hash; struct ip_vs_app *inc; int result = 0; @@ -407,12 +408,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) /* Lookup application incarnations and bind the right one */ hash = udp_app_hashkey(cp->vport); - spin_lock(&udp_app_lock); - list_for_each_entry(inc, &udp_apps[hash], p_list) { + spin_lock(&ipvs->udp_app_lock); + list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; - spin_unlock(&udp_app_lock); + spin_unlock(&ipvs->udp_app_lock); IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", @@ -429,14 +430,14 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp) goto out; } } - spin_unlock(&udp_app_lock); + spin_unlock(&ipvs->udp_app_lock); out: return result; } -static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { +static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = { [IP_VS_UDP_S_NORMAL] = 5*60*HZ, [IP_VS_UDP_S_LAST] = 2*HZ, }; @@ -446,14 +447,6 @@ static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = { [IP_VS_UDP_S_LAST] = "BUG!", }; - -static int -udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, - udp_state_name_table, sname, to); -} - static const char * udp_state_name(int state) { if (state >= IP_VS_UDP_S_LAST) @@ -464,20 +457,30 @@ static const char * udp_state_name(int state) static int udp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, - struct ip_vs_protocol *pp) + struct ip_vs_proto_data *pd) { - cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; + if (unlikely(!pd)) { + pr_err("UDP no ns data\n"); + return 0; + } + + cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL]; return 1; } -static void udp_init(struct ip_vs_protocol *pp) +static void __udp_init(struct net *net, struct ip_vs_proto_data *pd) { - IP_VS_INIT_HASH_TABLE(udp_apps); - pp->timeout_table = udp_timeouts; + struct netns_ipvs *ipvs = net_ipvs(net); + + ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE); + spin_lock_init(&ipvs->udp_app_lock); + pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts, + sizeof(udp_timeouts)); } -static void udp_exit(struct ip_vs_protocol *pp) +static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd) { + kfree(pd->timeout_table); } @@ -486,8 +489,10 @@ struct ip_vs_protocol ip_vs_protocol_udp = { .protocol = IPPROTO_UDP, .num_states = IP_VS_UDP_S_LAST, .dont_defrag = 0, - .init = udp_init, - .exit = udp_exit, + .init = NULL, + .exit = NULL, + .init_netns = __udp_init, + .exit_netns = __udp_exit, .conn_schedule = udp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, @@ -501,5 +506,4 @@ struct ip_vs_protocol ip_vs_protocol_udp = { .app_conn_bind = udp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, - .set_state_timeout = udp_set_state_timeout, }; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index ab85aedea17..d1adf988eb0 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -5,6 +5,18 @@ * high-performance and highly available server based on a * cluster of servers. * + * Version 1, is capable of handling both version 0 and 1 messages. + * Version 0 is the plain old format. + * Note Version 0 receivers will just drop Ver 1 messages. + * Version 1 is capable of handle IPv6, Persistence data, + * time-outs, and firewall marks. + * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order. + * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0 + * + * Definitions Message: is a complete datagram + * Sync_conn: is a part of a Message + * Param Data is an option to a Sync_conn. + * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * ip_vs_sync: sync connection info from master load balancer to backups @@ -15,6 +27,8 @@ * Alexandre Cassen : Added SyncID support for incoming sync * messages filtering. * Justin Ossevoort : Fix endian problem on sync message size. + * Hans Schillstrom : Added Version 1: i.e. IPv6, + * Persistence support, fwmark and time-out. */ #define KMSG_COMPONENT "IPVS" @@ -35,6 +49,8 @@ #include <linux/wait.h> #include <linux/kernel.h> +#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */ + #include <net/ip.h> #include <net/sock.h> @@ -43,11 +59,13 @@ #define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ #define IP_VS_SYNC_PORT 8848 /* multicast port */ +#define SYNC_PROTO_VER 1 /* Protocol version in header */ /* * IPVS sync connection entry + * Version 0, i.e. original version. */ -struct ip_vs_sync_conn { +struct ip_vs_sync_conn_v0 { __u8 reserved; /* Protocol, addresses and port numbers */ @@ -71,41 +89,159 @@ struct ip_vs_sync_conn_options { struct ip_vs_seq out_seq; /* outgoing seq. struct */ }; +/* + Sync Connection format (sync_conn) + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Protocol | Ver. | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Flags | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | State | cport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | vport | dport | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | fwmark | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | timeout (in sec.) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | ... | + | IP-Addresses (v4 or v6) | + | ... | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + Optional Parameters. + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param. Type | Param. Length | Param. data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | + | ... | + | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | Param Type | Param. Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Param data | + | Last Param data should be padded for 32 bit alignment | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +/* + * Type 0, IPv4 sync connection format + */ +struct ip_vs_sync_v4 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; +/* + * Type 2 messages IPv6 + */ +struct ip_vs_sync_v6 { + __u8 type; + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 ver_size; /* Version msb 4 bits */ + /* Flags and state transition */ + __be32 flags; /* status flags */ + __be16 state; /* state info */ + /* Protocol, addresses and port numbers */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 fwmark; /* Firewall mark from skb */ + __be32 timeout; /* cp timeout */ + struct in6_addr caddr; /* client address */ + struct in6_addr vaddr; /* virtual address */ + struct in6_addr daddr; /* destination address */ + /* The sequence options start here */ + /* PE data padded to 32bit alignment after seq. options */ +}; + +union ip_vs_sync_conn { + struct ip_vs_sync_v4 v4; + struct ip_vs_sync_v6 v6; +}; + +/* Bits in Type field in above */ +#define STYPE_INET6 0 +#define STYPE_F_INET6 (1 << STYPE_INET6) + +#define SVER_SHIFT 12 /* Shift to get version */ +#define SVER_MASK 0x0fff /* Mask to strip version */ + +#define IPVS_OPT_SEQ_DATA 1 +#define IPVS_OPT_PE_DATA 2 +#define IPVS_OPT_PE_NAME 3 +#define IPVS_OPT_PARAM 7 + +#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1)) +#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1)) +#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1)) +#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1)) + struct ip_vs_sync_thread_data { + struct net *net; struct socket *sock; char *buf; }; -#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) +/* Version 0 definition of packet sizes */ +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0)) #define FULL_CONN_SIZE \ -(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) +(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options)) /* - The master mulitcasts messages to the backup load balancers in the - following format. + The master mulitcasts messages (Datagrams) to the backup load balancers + in the following format. + + Version 1: + Note, first byte should be Zero, so ver 0 receivers will drop the packet. 0 1 2 3 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Count Conns | SyncID | Size | + | 0 | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | Version | Reserved, set to Zero | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | | IPVS Sync Connection (1) | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | . | - | . | + ~ . ~ | . | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | | | IPVS Sync Connection (n) | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Version 0 Header + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | IPVS Sync Connection (1) | */ #define SYNC_MESG_HEADER_LEN 4 #define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ -struct ip_vs_sync_mesg { +/* Version 0 header */ +struct ip_vs_sync_mesg_v0 { __u8 nr_conns; __u8 syncid; __u16 size; @@ -113,9 +249,16 @@ struct ip_vs_sync_mesg { /* ip_vs_sync_conn entries start here */ }; -/* the maximum length of sync (sending/receiving) message */ -static int sync_send_mesg_maxlen; -static int sync_recv_mesg_maxlen; +/* Version 1 header */ +struct ip_vs_sync_mesg { + __u8 reserved; /* must be zero */ + __u8 syncid; + __u16 size; + __u8 nr_conns; + __s8 version; /* SYNC_PROTO_VER */ + __u16 spare; + /* ip_vs_sync_conn entries start here */ +}; struct ip_vs_sync_buff { struct list_head list; @@ -127,28 +270,6 @@ struct ip_vs_sync_buff { unsigned char *end; }; - -/* the sync_buff list head and the lock */ -static LIST_HEAD(ip_vs_sync_queue); -static DEFINE_SPINLOCK(ip_vs_sync_lock); - -/* current sync_buff for accepting new conn entries */ -static struct ip_vs_sync_buff *curr_sb = NULL; -static DEFINE_SPINLOCK(curr_sb_lock); - -/* ipvs sync daemon state */ -volatile int ip_vs_sync_state = IP_VS_STATE_NONE; -volatile int ip_vs_master_syncid = 0; -volatile int ip_vs_backup_syncid = 0; - -/* multicast interface name */ -char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - -/* sync daemon tasks */ -static struct task_struct *sync_master_thread; -static struct task_struct *sync_backup_thread; - /* multicast addr */ static struct sockaddr_in mcast_addr = { .sin_family = AF_INET, @@ -156,41 +277,71 @@ static struct sockaddr_in mcast_addr = { .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), }; +/* + * Copy of struct ip_vs_seq + * From unaligned network order to aligned host order + */ +static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) +{ + ho->init_seq = get_unaligned_be32(&no->init_seq); + ho->delta = get_unaligned_be32(&no->delta); + ho->previous_delta = get_unaligned_be32(&no->previous_delta); +} + +/* + * Copy of struct ip_vs_seq + * From Aligned host order to unaligned network order + */ +static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no) +{ + put_unaligned_be32(ho->init_seq, &no->init_seq); + put_unaligned_be32(ho->delta, &no->delta); + put_unaligned_be32(ho->previous_delta, &no->previous_delta); +} -static inline struct ip_vs_sync_buff *sb_dequeue(void) +static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs) { struct ip_vs_sync_buff *sb; - spin_lock_bh(&ip_vs_sync_lock); - if (list_empty(&ip_vs_sync_queue)) { + spin_lock_bh(&ipvs->sync_lock); + if (list_empty(&ipvs->sync_queue)) { sb = NULL; } else { - sb = list_entry(ip_vs_sync_queue.next, + sb = list_entry(ipvs->sync_queue.next, struct ip_vs_sync_buff, list); list_del(&sb->list); } - spin_unlock_bh(&ip_vs_sync_lock); + spin_unlock_bh(&ipvs->sync_lock); return sb; } -static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) +/* + * Create a new sync buffer for Version 1 proto. + */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create(struct netns_ipvs *ipvs) { struct ip_vs_sync_buff *sb; if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { kfree(sb); return NULL; } + sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */ + sb->mesg->version = SYNC_PROTO_VER; + sb->mesg->syncid = ipvs->master_syncid; + sb->mesg->size = sizeof(struct ip_vs_sync_mesg); sb->mesg->nr_conns = 0; - sb->mesg->syncid = ip_vs_master_syncid; - sb->mesg->size = 4; - sb->head = (unsigned char *)sb->mesg + 4; - sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; + sb->mesg->spare = 0; + sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); + sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; + sb->firstuse = jiffies; return sb; } @@ -201,14 +352,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) kfree(sb); } -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +static inline void sb_queue_tail(struct netns_ipvs *ipvs) { - spin_lock(&ip_vs_sync_lock); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ip_vs_sync_queue); + struct ip_vs_sync_buff *sb = ipvs->sync_buff; + + spin_lock(&ipvs->sync_lock); + if (ipvs->sync_state & IP_VS_STATE_MASTER) + list_add_tail(&sb->list, &ipvs->sync_queue); else ip_vs_sync_buff_release(sb); - spin_unlock(&ip_vs_sync_lock); + spin_unlock(&ipvs->sync_lock); } /* @@ -216,36 +369,101 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) * than the specified time or the specified time is zero. */ static inline struct ip_vs_sync_buff * -get_curr_sync_buff(unsigned long time) +get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time) { struct ip_vs_sync_buff *sb; - spin_lock_bh(&curr_sb_lock); - if (curr_sb && (time == 0 || - time_before(jiffies - curr_sb->firstuse, time))) { - sb = curr_sb; - curr_sb = NULL; + spin_lock_bh(&ipvs->sync_buff_lock); + if (ipvs->sync_buff && (time == 0 || + time_before(jiffies - ipvs->sync_buff->firstuse, time))) { + sb = ipvs->sync_buff; + ipvs->sync_buff = NULL; } else sb = NULL; - spin_unlock_bh(&curr_sb_lock); + spin_unlock_bh(&ipvs->sync_buff_lock); return sb; } +/* + * Switch mode from sending version 0 or 1 + * - must handle sync_buf + */ +void ip_vs_sync_switch_mode(struct net *net, int mode) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + if (!ipvs->sync_state & IP_VS_STATE_MASTER) + return; + if (mode == ipvs->sysctl_sync_ver || !ipvs->sync_buff) + return; + + spin_lock_bh(&ipvs->sync_buff_lock); + /* Buffer empty ? then let buf_create do the job */ + if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) { + kfree(ipvs->sync_buff); + ipvs->sync_buff = NULL; + } else { + spin_lock_bh(&ipvs->sync_lock); + if (ipvs->sync_state & IP_VS_STATE_MASTER) + list_add_tail(&ipvs->sync_buff->list, + &ipvs->sync_queue); + else + ip_vs_sync_buff_release(ipvs->sync_buff); + spin_unlock_bh(&ipvs->sync_lock); + } + spin_unlock_bh(&ipvs->sync_buff_lock); +} /* + * Create a new sync buffer for Version 0 proto. + */ +static inline struct ip_vs_sync_buff * +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) +{ + struct ip_vs_sync_buff *sb; + struct ip_vs_sync_mesg_v0 *mesg; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + if (!sb->mesg) { + kfree(sb); + return NULL; + } + mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; + mesg->nr_conns = 0; + mesg->syncid = ipvs->master_syncid; + mesg->size = sizeof(struct ip_vs_sync_mesg_v0); + sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); + sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; + sb->firstuse = jiffies; + return sb; +} + +/* + * Version 0 , could be switched in by sys_ctl. * Add an ip_vs_conn information into the current sync_buff. - * Called by ip_vs_in. */ -void ip_vs_sync_conn(struct ip_vs_conn *cp) +void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp) { - struct ip_vs_sync_mesg *m; - struct ip_vs_sync_conn *s; + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg_v0 *m; + struct ip_vs_sync_conn_v0 *s; int len; - spin_lock(&curr_sb_lock); - if (!curr_sb) { - if (!(curr_sb=ip_vs_sync_buff_create())) { - spin_unlock(&curr_sb_lock); + if (unlikely(cp->af != AF_INET)) + return; + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + + spin_lock(&ipvs->sync_buff_lock); + if (!ipvs->sync_buff) { + ipvs->sync_buff = + ip_vs_sync_buff_create_v0(ipvs); + if (!ipvs->sync_buff) { + spin_unlock(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); return; } @@ -253,10 +471,11 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : SIMPLE_CONN_SIZE; - m = curr_sb->mesg; - s = (struct ip_vs_sync_conn *)curr_sb->head; + m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg; + s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head; /* copy members */ + s->reserved = 0; s->protocol = cp->protocol; s->cport = cp->cport; s->vport = cp->vport; @@ -274,83 +493,366 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) m->nr_conns++; m->size += len; - curr_sb->head += len; + ipvs->sync_buff->head += len; /* check if there is a space for next one */ - if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { - sb_queue_tail(curr_sb); - curr_sb = NULL; + if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) { + sb_queue_tail(ipvs); + ipvs->sync_buff = NULL; } - spin_unlock(&curr_sb_lock); + spin_unlock(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ if (cp->control) - ip_vs_sync_conn(cp->control); + ip_vs_sync_conn(net, cp->control); +} + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + * Sending Version 1 messages + */ +void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg *m; + union ip_vs_sync_conn *s; + __u8 *p; + unsigned int len, pe_name_len, pad; + + /* Handle old version of the protocol */ + if (ipvs->sysctl_sync_ver == 0) { + ip_vs_sync_conn_v0(net, cp); + return; + } + /* Do not sync ONE PACKET */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + goto control; +sloop: + /* Sanity checks */ + pe_name_len = 0; + if (cp->pe_data_len) { + if (!cp->pe_data || !cp->dest) { + IP_VS_ERR_RL("SYNC, connection pe_data invalid\n"); + return; + } + pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN); + } + + spin_lock(&ipvs->sync_buff_lock); + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + len = sizeof(struct ip_vs_sync_v6); + else +#endif + len = sizeof(struct ip_vs_sync_v4); + + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) + len += sizeof(struct ip_vs_sync_conn_options) + 2; + + if (cp->pe_data_len) + len += cp->pe_data_len + 2; /* + Param hdr field */ + if (pe_name_len) + len += pe_name_len + 2; + + /* check if there is a space for this one */ + pad = 0; + if (ipvs->sync_buff) { + pad = (4 - (size_t)ipvs->sync_buff->head) & 3; + if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) { + sb_queue_tail(ipvs); + ipvs->sync_buff = NULL; + pad = 0; + } + } + + if (!ipvs->sync_buff) { + ipvs->sync_buff = ip_vs_sync_buff_create(ipvs); + if (!ipvs->sync_buff) { + spin_unlock(&ipvs->sync_buff_lock); + pr_err("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + m = ipvs->sync_buff->mesg; + p = ipvs->sync_buff->head; + ipvs->sync_buff->head += pad + len; + m->size += pad + len; + /* Add ev. padding from prev. sync_conn */ + while (pad--) + *(p++) = 0; + + s = (union ip_vs_sync_conn *)p; + + /* Set message type & copy members */ + s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0); + s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */ + s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED); + s->v4.state = htons(cp->state); + s->v4.protocol = cp->protocol; + s->v4.cport = cp->cport; + s->v4.vport = cp->vport; + s->v4.dport = cp->dport; + s->v4.fwmark = htonl(cp->fwmark); + s->v4.timeout = htonl(cp->timeout / HZ); + m->nr_conns++; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) { + p += sizeof(struct ip_vs_sync_v6); + ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6); + ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6); + ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6); + } else +#endif + { + p += sizeof(struct ip_vs_sync_v4); /* options ptr */ + s->v4.caddr = cp->caddr.ip; + s->v4.vaddr = cp->vaddr.ip; + s->v4.daddr = cp->daddr.ip; + } + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + *(p++) = IPVS_OPT_SEQ_DATA; + *(p++) = sizeof(struct ip_vs_sync_conn_options); + hton_seq((struct ip_vs_seq *)p, &cp->in_seq); + p += sizeof(struct ip_vs_seq); + hton_seq((struct ip_vs_seq *)p, &cp->out_seq); + p += sizeof(struct ip_vs_seq); + } + /* Handle pe data */ + if (cp->pe_data_len && cp->pe_data) { + *(p++) = IPVS_OPT_PE_DATA; + *(p++) = cp->pe_data_len; + memcpy(p, cp->pe_data, cp->pe_data_len); + p += cp->pe_data_len; + if (pe_name_len) { + /* Add PE_NAME */ + *(p++) = IPVS_OPT_PE_NAME; + *(p++) = pe_name_len; + memcpy(p, cp->pe->name, pe_name_len); + p += pe_name_len; + } + } + + spin_unlock(&ipvs->sync_buff_lock); + +control: + /* synchronize its controller if it has */ + cp = cp->control; + if (!cp) + return; + /* + * Reduce sync rate for templates + * i.e only increment in_pkts for Templates. + */ + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + int pkts = atomic_add_return(1, &cp->in_pkts); + + if (pkts % ipvs->sysctl_sync_threshold[1] != 1) + return; + } + goto sloop; } +/* + * fill_param used by version 1 + */ static inline int -ip_vs_conn_fill_param_sync(int af, int protocol, - const union nf_inet_addr *caddr, __be16 cport, - const union nf_inet_addr *vaddr, __be16 vport, - struct ip_vs_conn_param *p) +ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc, + struct ip_vs_conn_param *p, + __u8 *pe_data, unsigned int pe_data_len, + __u8 *pe_name, unsigned int pe_name_len) { - /* XXX: Need to take into account persistence engine */ - ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_conn_fill_param(net, af, sc->v6.protocol, + (const union nf_inet_addr *)&sc->v6.caddr, + sc->v6.cport, + (const union nf_inet_addr *)&sc->v6.vaddr, + sc->v6.vport, p); + else +#endif + ip_vs_conn_fill_param(net, af, sc->v4.protocol, + (const union nf_inet_addr *)&sc->v4.caddr, + sc->v4.cport, + (const union nf_inet_addr *)&sc->v4.vaddr, + sc->v4.vport, p); + /* Handle pe data */ + if (pe_data_len) { + if (pe_name_len) { + char buff[IP_VS_PENAME_MAXLEN+1]; + + memcpy(buff, pe_name, pe_name_len); + buff[pe_name_len]=0; + p->pe = __ip_vs_pe_getbyname(buff); + if (!p->pe) { + IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n", + buff); + return 1; + } + } else { + IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n"); + return 1; + } + + p->pe_data = kmalloc(pe_data_len, GFP_ATOMIC); + if (!p->pe_data) { + if (p->pe->module) + module_put(p->pe->module); + return -ENOMEM; + } + memcpy(p->pe_data, pe_data, pe_data_len); + p->pe_data_len = pe_data_len; + } return 0; } /* - * Process received multicast message and create the corresponding - * ip_vs_conn entries. + * Connection Add / Update. + * Common for version 0 and 1 reception of backup sync_conns. + * Param: ... + * timeout is in sec. */ -static void ip_vs_process_message(const char *buffer, const size_t buflen) +static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, + unsigned int flags, unsigned int state, + unsigned int protocol, unsigned int type, + const union nf_inet_addr *daddr, __be16 dport, + unsigned long timeout, __u32 fwmark, + struct ip_vs_sync_conn_options *opt) { - struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; - struct ip_vs_sync_conn *s; - struct ip_vs_sync_conn_options *opt; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; struct ip_vs_dest *dest; - struct ip_vs_conn_param param; - char *p; - int i; + struct ip_vs_conn *cp; + struct netns_ipvs *ipvs = net_ipvs(net); - if (buflen < sizeof(struct ip_vs_sync_mesg)) { - IP_VS_ERR_RL("sync message header too short\n"); - return; - } + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(param); + else + cp = ip_vs_ct_in_get(param); - /* Convert size back to host byte order */ - m->size = ntohs(m->size); + if (cp && param->pe_data) /* Free pe_data */ + kfree(param->pe_data); + if (!cp) { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr, + param->vport, protocol, fwmark); - if (buflen != m->size) { - IP_VS_ERR_RL("bogus sync message size\n"); - return; + /* Set the approprite ativity flag */ + if (protocol == IPPROTO_TCP) { + if (state != IP_VS_TCP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } else if (protocol == IPPROTO_SCTP) { + if (state != IP_VS_SCTP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } + cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark); + if (dest) + atomic_dec(&dest->refcnt); + if (!cp) { + if (param->pe_data) + kfree(param->pe_data); + IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); + return; + } + } else if (!cp->dest) { + dest = ip_vs_try_bind_dest(cp); + if (dest) + atomic_dec(&dest->refcnt); + } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && + (cp->state != state)) { + /* update active/inactive flag for the connection */ + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && + (cp->state != state)) { + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_SCTP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } } - /* SyncID sanity check */ - if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { - IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", - m->syncid); - return; + if (opt) + memcpy(&cp->in_seq, opt, sizeof(*opt)); + atomic_set(&cp->in_pkts, ipvs->sysctl_sync_threshold[0]); + cp->state = state; + cp->old_state = cp->state; + /* + * For Ver 0 messages style + * - Not possible to recover the right timeout for templates + * - can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + * Ver 1 messages style. + * - No problem. + */ + if (timeout) { + if (timeout > MAX_SCHEDULE_TIMEOUT / HZ) + timeout = MAX_SCHEDULE_TIMEOUT / HZ; + cp->timeout = timeout*HZ; + } else { + struct ip_vs_proto_data *pd; + + pd = ip_vs_proto_data_get(net, protocol); + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table) + cp->timeout = pd->timeout_table[state]; + else + cp->timeout = (3*60*HZ); } + ip_vs_conn_put(cp); +} - p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); +/* + * Process received multicast message for Version 0 + */ +static void ip_vs_process_message_v0(struct net *net, const char *buffer, + const size_t buflen) +{ + struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer; + struct ip_vs_sync_conn_v0 *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + char *p; + int i; + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0); for (i=0; i<m->nr_conns; i++) { unsigned flags, state; if (p + SIMPLE_CONN_SIZE > buffer+buflen) { - IP_VS_ERR_RL("bogus conn in sync message\n"); + IP_VS_ERR_RL("BACKUP v0, bogus conn\n"); return; } - s = (struct ip_vs_sync_conn *) p; + s = (struct ip_vs_sync_conn_v0 *) p; flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; flags &= ~IP_VS_CONN_F_HASHED; if (flags & IP_VS_CONN_F_SEQ_MASK) { opt = (struct ip_vs_sync_conn_options *)&s[1]; p += FULL_CONN_SIZE; if (p > buffer+buflen) { - IP_VS_ERR_RL("bogus conn options in sync message\n"); + IP_VS_ERR_RL("BACKUP v0, Dropping buffer bogus conn options\n"); return; } } else { @@ -362,118 +864,286 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) if (!(flags & IP_VS_CONN_F_TEMPLATE)) { pp = ip_vs_proto_get(s->protocol); if (!pp) { - IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", + IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n", s->protocol); continue; } if (state >= pp->num_states) { - IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", + IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n", pp->name, state); continue; } } else { /* protocol in templates is not used for state/timeout */ - pp = NULL; if (state > 0) { - IP_VS_DBG(2, "Invalid template state %u in sync msg\n", + IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n", state); state = 0; } } - { - if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport, ¶m)) { - pr_err("ip_vs_conn_fill_param_sync failed"); - return; + ip_vs_conn_fill_param(net, AF_INET, s->protocol, + (const union nf_inet_addr *)&s->caddr, + s->cport, + (const union nf_inet_addr *)&s->vaddr, + s->vport, ¶m); + + /* Send timeout as Zero */ + ip_vs_proc_conn(net, ¶m, flags, state, s->protocol, AF_INET, + (union nf_inet_addr *)&s->daddr, s->dport, + 0, 0, opt); + } +} + +/* + * Handle options + */ +static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen, + __u32 *opt_flags, + struct ip_vs_sync_conn_options *opt) +{ + struct ip_vs_sync_conn_options *topt; + + topt = (struct ip_vs_sync_conn_options *)p; + + if (plen != sizeof(struct ip_vs_sync_conn_options)) { + IP_VS_DBG(2, "BACKUP, bogus conn options length\n"); + return -EINVAL; + } + if (*opt_flags & IPVS_OPT_F_SEQ_DATA) { + IP_VS_DBG(2, "BACKUP, conn options found twice\n"); + return -EINVAL; + } + ntoh_seq(&topt->in_seq, &opt->in_seq); + ntoh_seq(&topt->out_seq, &opt->out_seq); + *opt_flags |= IPVS_OPT_F_SEQ_DATA; + return 0; +} + +static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len, + __u8 **data, unsigned int maxlen, + __u32 *opt_flags, __u32 flag) +{ + if (plen > maxlen) { + IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen); + return -EINVAL; + } + if (*opt_flags & flag) { + IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag); + return -EINVAL; + } + *data_len = plen; + *data = p; + *opt_flags |= flag; + return 0; +} +/* + * Process a Version 1 sync. connection + */ +static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) +{ + struct ip_vs_sync_conn_options opt; + union ip_vs_sync_conn *s; + struct ip_vs_protocol *pp; + struct ip_vs_conn_param param; + __u32 flags; + unsigned int af, state, pe_data_len=0, pe_name_len=0; + __u8 *pe_data=NULL, *pe_name=NULL; + __u32 opt_flags=0; + int retc=0; + + s = (union ip_vs_sync_conn *) p; + + if (s->v6.type & STYPE_F_INET6) { +#ifdef CONFIG_IP_VS_IPV6 + af = AF_INET6; + p += sizeof(struct ip_vs_sync_v6); +#else + IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n"); + retc = 10; + goto out; +#endif + } else if (!s->v4.type) { + af = AF_INET; + p += sizeof(struct ip_vs_sync_v4); + } else { + return -10; + } + if (p > msg_end) + return -20; + + /* Process optional params check Type & Len. */ + while (p < msg_end) { + int ptype; + int plen; + + if (p+2 > msg_end) + return -30; + ptype = *(p++); + plen = *(p++); + + if (!plen || ((p + plen) > msg_end)) + return -40; + /* Handle seq option p = param data */ + switch (ptype & ~IPVS_OPT_F_PARAM) { + case IPVS_OPT_SEQ_DATA: + if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt)) + return -50; + break; + + case IPVS_OPT_PE_DATA: + if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data, + IP_VS_PEDATA_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_DATA)) + return -60; + break; + + case IPVS_OPT_PE_NAME: + if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name, + IP_VS_PENAME_MAXLEN, &opt_flags, + IPVS_OPT_F_PE_NAME)) + return -70; + break; + + default: + /* Param data mandatory ? */ + if (!(ptype & IPVS_OPT_F_PARAM)) { + IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n", + ptype & ~IPVS_OPT_F_PARAM); + retc = 20; + goto out; } - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(¶m); - else - cp = ip_vs_ct_in_get(¶m); } - if (!cp) { - /* - * Find the appropriate destination for the connection. - * If it is not found the connection will remain unbound - * but still handled. - */ - dest = ip_vs_find_dest(AF_INET, - (union nf_inet_addr *)&s->daddr, - s->dport, - (union nf_inet_addr *)&s->vaddr, - s->vport, - s->protocol); - /* Set the approprite ativity flag */ - if (s->protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } else if (s->protocol == IPPROTO_SCTP) { - if (state != IP_VS_SCTP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; + p += plen; /* Next option */ + } + + /* Get flags and Mask off unsupported */ + flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK; + flags |= IP_VS_CONN_F_SYNC; + state = ntohs(s->v4.state); + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->v4.protocol); + if (!pp) { + IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n", + s->v4.protocol); + retc = 30; + goto out; + } + if (state >= pp->num_states) { + IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n", + pp->name, state); + retc = 40; + goto out; + } + } else { + /* protocol in templates is not used for state/timeout */ + if (state > 0) { + IP_VS_DBG(3, "BACKUP, Invalid template state %u\n", + state); + state = 0; + } + } + if (ip_vs_conn_fill_param_sync(net, af, s, ¶m, pe_data, + pe_data_len, pe_name, pe_name_len)) { + retc = 50; + goto out; + } + /* If only IPv4, just silent skip IPv6 */ + if (af == AF_INET) + ip_vs_proc_conn(net, ¶m, flags, state, s->v4.protocol, af, + (union nf_inet_addr *)&s->v4.daddr, s->v4.dport, + ntohl(s->v4.timeout), ntohl(s->v4.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); +#ifdef CONFIG_IP_VS_IPV6 + else + ip_vs_proc_conn(net, ¶m, flags, state, s->v6.protocol, af, + (union nf_inet_addr *)&s->v6.daddr, s->v6.dport, + ntohl(s->v6.timeout), ntohl(s->v6.fwmark), + (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) + ); +#endif + return 0; + /* Error exit */ +out: + IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc); + return retc; + +} +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + * Handles Version 0 & 1 + */ +static void ip_vs_process_message(struct net *net, __u8 *buffer, + const size_t buflen) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer; + __u8 *p, *msg_end; + int i, nr_conns; + + if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) { + IP_VS_DBG(2, "BACKUP, message header too short\n"); + return; + } + /* Convert size back to host byte order */ + m2->size = ntohs(m2->size); + + if (buflen != m2->size) { + IP_VS_DBG(2, "BACKUP, bogus message size\n"); + return; + } + /* SyncID sanity check */ + if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { + IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); + return; + } + /* Handle version 1 message */ + if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0) + && (m2->spare == 0)) { + + msg_end = buffer + sizeof(struct ip_vs_sync_mesg); + nr_conns = m2->nr_conns; + + for (i=0; i<nr_conns; i++) { + union ip_vs_sync_conn *s; + unsigned size; + int retc; + + p = msg_end; + if (p + sizeof(s->v4) > buffer+buflen) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, to small\n"); + return; } - cp = ip_vs_conn_new(¶m, - (union nf_inet_addr *)&s->daddr, - s->dport, flags, dest); - if (dest) - atomic_dec(&dest->refcnt); - if (!cp) { - pr_err("ip_vs_conn_new failed\n"); + s = (union ip_vs_sync_conn *)p; + size = ntohs(s->v4.ver_size) & SVER_MASK; + msg_end = p + size; + /* Basic sanity checks */ + if (msg_end > buffer+buflen) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n"); return; } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; + if (ntohs(s->v4.ver_size) >> SVER_SHIFT) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n", + ntohs(s->v4.ver_size) >> SVER_SHIFT); + return; } - } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && - (cp->state != state)) { - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_SCTP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; + /* Process a single sync_conn */ + retc = ip_vs_proc_sync_conn(net, p, msg_end); + if (retc < 0) { + IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n", + retc); + return; } + /* Make sure we have 32 bit alignment */ + msg_end = p + ((size + 3) & ~3); } - - if (opt) - memcpy(&cp->in_seq, opt, sizeof(*opt)); - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = state; - cp->old_state = cp->state; - /* - * We can not recover the right timeout for templates - * in all cases, we can not find the right fwmark - * virtual service. If needed, we can do it for - * non-fwmark persistent services. - */ - if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) - cp->timeout = pp->timeout_table[state]; - else - cp->timeout = (3*60*HZ); - ip_vs_conn_put(cp); + } else { + /* Old type of message */ + ip_vs_process_message_v0(net, buffer, buflen); + return; } } @@ -511,8 +1181,10 @@ static int set_mcast_if(struct sock *sk, char *ifname) { struct net_device *dev; struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(net, ifname); + if (!dev) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) @@ -531,30 +1203,33 @@ static int set_mcast_if(struct sock *sk, char *ifname) * Set the maximum length of sync message according to the * specified interface's MTU. */ -static int set_sync_mesg_maxlen(int sync_state) +static int set_sync_mesg_maxlen(struct net *net, int sync_state) { + struct netns_ipvs *ipvs = net_ipvs(net); struct net_device *dev; int num; if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) + dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); + if (!dev) return -ENODEV; num = (dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr) - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + + ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", sync_send_mesg_maxlen); + "message %d.\n", ipvs->send_mesg_maxlen); } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) + dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); + if (!dev) return -ENODEV; - sync_recv_mesg_maxlen = dev->mtu - + ipvs->recv_mesg_maxlen = dev->mtu - sizeof(struct iphdr) - sizeof(struct udphdr); IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", sync_recv_mesg_maxlen); + "message %d.\n", ipvs->recv_mesg_maxlen); } return 0; @@ -569,6 +1244,7 @@ static int set_sync_mesg_maxlen(int sync_state) static int join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) { + struct net *net = sock_net(sk); struct ip_mreqn mreq; struct net_device *dev; int ret; @@ -576,7 +1252,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) memset(&mreq, 0, sizeof(mreq)); memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(net, ifname); + if (!dev) return -ENODEV; if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) return -EINVAL; @@ -593,11 +1270,13 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) static int bind_mcastif_addr(struct socket *sock, char *ifname) { + struct net *net = sock_net(sock->sk); struct net_device *dev; __be32 addr; struct sockaddr_in sin; - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + dev = __dev_get_by_name(net, ifname); + if (!dev) return -ENODEV; addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); @@ -619,8 +1298,9 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) /* * Set up sending multicast socket over UDP */ -static struct socket * make_send_sock(void) +static struct socket *make_send_sock(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); struct socket *sock; int result; @@ -631,7 +1311,7 @@ static struct socket * make_send_sock(void) return ERR_PTR(result); } - result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); + result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); goto error; @@ -640,7 +1320,7 @@ static struct socket * make_send_sock(void) set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); - result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); + result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error binding address of the mcast interface\n"); goto error; @@ -664,8 +1344,9 @@ static struct socket * make_send_sock(void) /* * Set up receiving multicast socket over UDP */ -static struct socket * make_receive_sock(void) +static struct socket *make_receive_sock(struct net *net) { + struct netns_ipvs *ipvs = net_ipvs(net); struct socket *sock; int result; @@ -689,7 +1370,7 @@ static struct socket * make_receive_sock(void) /* join the multicast group */ result = join_mcast_group(sock->sk, (struct in_addr *) &mcast_addr.sin_addr, - ip_vs_backup_mcast_ifn); + ipvs->backup_mcast_ifn); if (result < 0) { pr_err("Error joining to the multicast group\n"); goto error; @@ -760,20 +1441,21 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) static int sync_thread_master(void *data) { struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); struct ip_vs_sync_buff *sb; pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " "syncid = %d\n", - ip_vs_master_mcast_ifn, ip_vs_master_syncid); + ipvs->master_mcast_ifn, ipvs->master_syncid); while (!kthread_should_stop()) { - while ((sb = sb_dequeue())) { + while ((sb = sb_dequeue(ipvs))) { ip_vs_send_sync_msg(tinfo->sock, sb->mesg); ip_vs_sync_buff_release(sb); } - /* check if entries stay in curr_sb for 2 seconds */ - sb = get_curr_sync_buff(2 * HZ); + /* check if entries stay in ipvs->sync_buff for 2 seconds */ + sb = get_curr_sync_buff(ipvs, 2 * HZ); if (sb) { ip_vs_send_sync_msg(tinfo->sock, sb->mesg); ip_vs_sync_buff_release(sb); @@ -783,14 +1465,13 @@ static int sync_thread_master(void *data) } /* clean up the sync_buff queue */ - while ((sb=sb_dequeue())) { + while ((sb = sb_dequeue(ipvs))) ip_vs_sync_buff_release(sb); - } /* clean up the current sync_buff */ - if ((sb = get_curr_sync_buff(0))) { + sb = get_curr_sync_buff(ipvs, 0); + if (sb) ip_vs_sync_buff_release(sb); - } /* release the sending multicast socket */ sock_release(tinfo->sock); @@ -803,11 +1484,12 @@ static int sync_thread_master(void *data) static int sync_thread_backup(void *data) { struct ip_vs_sync_thread_data *tinfo = data; + struct netns_ipvs *ipvs = net_ipvs(tinfo->net); int len; pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " "syncid = %d\n", - ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); + ipvs->backup_mcast_ifn, ipvs->backup_syncid); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -817,7 +1499,7 @@ static int sync_thread_backup(void *data) /* do we have data now? */ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { len = ip_vs_receive(tinfo->sock, tinfo->buf, - sync_recv_mesg_maxlen); + ipvs->recv_mesg_maxlen); if (len <= 0) { pr_err("receiving message error\n"); break; @@ -826,7 +1508,7 @@ static int sync_thread_backup(void *data) /* disable bottom half, because it accesses the data shared by softirq while getting/creating conns */ local_bh_disable(); - ip_vs_process_message(tinfo->buf, len); + ip_vs_process_message(tinfo->net, tinfo->buf, len); local_bh_enable(); } } @@ -840,41 +1522,42 @@ static int sync_thread_backup(void *data) } -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) { struct ip_vs_sync_thread_data *tinfo; struct task_struct **realtask, *task; struct socket *sock; + struct netns_ipvs *ipvs = net_ipvs(net); char *name, *buf = NULL; int (*threadfn)(void *data); int result = -ENOMEM; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", - sizeof(struct ip_vs_sync_conn)); + sizeof(struct ip_vs_sync_conn_v0)); if (state == IP_VS_STATE_MASTER) { - if (sync_master_thread) + if (ipvs->master_thread) return -EEXIST; - strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, - sizeof(ip_vs_master_mcast_ifn)); - ip_vs_master_syncid = syncid; - realtask = &sync_master_thread; - name = "ipvs_syncmaster"; + strlcpy(ipvs->master_mcast_ifn, mcast_ifn, + sizeof(ipvs->master_mcast_ifn)); + ipvs->master_syncid = syncid; + realtask = &ipvs->master_thread; + name = "ipvs_master:%d"; threadfn = sync_thread_master; - sock = make_send_sock(); + sock = make_send_sock(net); } else if (state == IP_VS_STATE_BACKUP) { - if (sync_backup_thread) + if (ipvs->backup_thread) return -EEXIST; - strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, - sizeof(ip_vs_backup_mcast_ifn)); - ip_vs_backup_syncid = syncid; - realtask = &sync_backup_thread; - name = "ipvs_syncbackup"; + strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, + sizeof(ipvs->backup_mcast_ifn)); + ipvs->backup_syncid = syncid; + realtask = &ipvs->backup_thread; + name = "ipvs_backup:%d"; threadfn = sync_thread_backup; - sock = make_receive_sock(); + sock = make_receive_sock(net); } else { return -EINVAL; } @@ -884,9 +1567,9 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) goto out; } - set_sync_mesg_maxlen(state); + set_sync_mesg_maxlen(net, state); if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); + buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL); if (!buf) goto outsocket; } @@ -895,10 +1578,11 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) if (!tinfo) goto outbuf; + tinfo->net = net; tinfo->sock = sock; tinfo->buf = buf; - task = kthread_run(threadfn, tinfo, name); + task = kthread_run(threadfn, tinfo, name, ipvs->gen); if (IS_ERR(task)) { result = PTR_ERR(task); goto outtinfo; @@ -906,7 +1590,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) /* mark as active */ *realtask = task; - ip_vs_sync_state |= state; + ipvs->sync_state |= state; /* increase the module use count */ ip_vs_use_count_inc(); @@ -924,16 +1608,18 @@ out: } -int stop_sync_thread(int state) +int stop_sync_thread(struct net *net, int state) { + struct netns_ipvs *ipvs = net_ipvs(net); + IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); if (state == IP_VS_STATE_MASTER) { - if (!sync_master_thread) + if (!ipvs->master_thread) return -ESRCH; pr_info("stopping master sync thread %d ...\n", - task_pid_nr(sync_master_thread)); + task_pid_nr(ipvs->master_thread)); /* * The lock synchronizes with sb_queue_tail(), so that we don't @@ -941,21 +1627,21 @@ int stop_sync_thread(int state) * progress of stopping the master sync daemon. */ - spin_lock_bh(&ip_vs_sync_lock); - ip_vs_sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ip_vs_sync_lock); - kthread_stop(sync_master_thread); - sync_master_thread = NULL; + spin_lock_bh(&ipvs->sync_lock); + ipvs->sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock_bh(&ipvs->sync_lock); + kthread_stop(ipvs->master_thread); + ipvs->master_thread = NULL; } else if (state == IP_VS_STATE_BACKUP) { - if (!sync_backup_thread) + if (!ipvs->backup_thread) return -ESRCH; pr_info("stopping backup sync thread %d ...\n", - task_pid_nr(sync_backup_thread)); + task_pid_nr(ipvs->backup_thread)); - ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(sync_backup_thread); - sync_backup_thread = NULL; + ipvs->sync_state &= ~IP_VS_STATE_BACKUP; + kthread_stop(ipvs->backup_thread); + ipvs->backup_thread = NULL; } else { return -EINVAL; } @@ -965,3 +1651,42 @@ int stop_sync_thread(int state) return 0; } + +/* + * Initialize data struct for each netns + */ +static int __net_init __ip_vs_sync_init(struct net *net) +{ + struct netns_ipvs *ipvs = net_ipvs(net); + + INIT_LIST_HEAD(&ipvs->sync_queue); + spin_lock_init(&ipvs->sync_lock); + spin_lock_init(&ipvs->sync_buff_lock); + + ipvs->sync_mcast_addr.sin_family = AF_INET; + ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT); + ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP); + return 0; +} + +static void __ip_vs_sync_cleanup(struct net *net) +{ + stop_sync_thread(net, IP_VS_STATE_MASTER); + stop_sync_thread(net, IP_VS_STATE_BACKUP); +} + +static struct pernet_operations ipvs_sync_ops = { + .init = __ip_vs_sync_init, + .exit = __ip_vs_sync_cleanup, +}; + + +int __init ip_vs_sync_init(void) +{ + return register_pernet_subsys(&ipvs_sync_ops); +} + +void __exit ip_vs_sync_cleanup(void) +{ + unregister_pernet_subsys(&ipvs_sync_ops); +} diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 5325a3fbe4a..1f2a4e35fb1 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -175,7 +175,6 @@ __ip_vs_reroute_locally(struct sk_buff *skb) .fl4_tos = RT_TOS(iph->tos), .mark = skb->mark, }; - struct rtable *rt; if (ip_route_output_key(net, &rt, &fl)) return 0; @@ -390,7 +389,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && + !skb_is_gso(skb)) { ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); @@ -443,7 +443,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if (skb->len > mtu) { + if (skb->len > mtu && !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); @@ -543,7 +543,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) && + !skb_is_gso(skb)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); @@ -658,7 +659,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if (skb->len > mtu) { + if (skb->len > mtu && !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); @@ -773,8 +774,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, df |= (old_iph->frag_off & htons(IP_DF)); - if ((old_iph->frag_off & htons(IP_DF)) - && mtu < ntohs(old_iph->tot_len)) { + if ((old_iph->frag_off & htons(IP_DF) && + mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; @@ -886,7 +887,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { + if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) && + !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); @@ -991,7 +993,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { + if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu && + !skb_is_gso(skb)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); ip_rt_put(rt); IP_VS_DBG_RL("%s(): frag needed\n", __func__); @@ -1158,7 +1161,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { + if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) && + !skb_is_gso(skb)) { icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; @@ -1272,7 +1276,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); - if (skb->len > mtu) { + if (skb->len > mtu && !skb_is_gso(skb)) { if (!skb->dev) { struct net *net = dev_net(skb_dst(skb)->dev); diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c new file mode 100644 index 00000000000..4e99cca6161 --- /dev/null +++ b/net/netfilter/nf_conntrack_broadcast.c @@ -0,0 +1,82 @@ +/* + * broadcast connection tracking helper + * + * (c) 2005 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/ip.h> +#include <net/route.h> +#include <linux/inetdevice.h> +#include <linux/skbuff.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> + +int nf_conntrack_broadcast_help(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int timeout) +{ + struct nf_conntrack_expect *exp; + struct iphdr *iph = ip_hdr(skb); + struct rtable *rt = skb_rtable(skb); + struct in_device *in_dev; + struct nf_conn_help *help = nfct_help(ct); + __be32 mask = 0; + + /* we're only interested in locally generated packets */ + if (skb->sk == NULL) + goto out; + if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) + goto out; + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + goto out; + + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->dst.dev); + if (in_dev != NULL) { + for_primary_ifa(in_dev) { + if (ifa->ifa_broadcast == iph->daddr) { + mask = ifa->ifa_mask; + break; + } + } endfor_ifa(in_dev); + } + rcu_read_unlock(); + + if (mask == 0) + goto out; + + exp = nf_ct_expect_alloc(ct); + if (exp == NULL) + goto out; + + exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port; + + exp->mask.src.u3.ip = mask; + exp->mask.src.u.udp.port = htons(0xFFFF); + + exp->expectfn = NULL; + exp->flags = NF_CT_EXPECT_PERMANENT; + exp->class = NF_CT_EXPECT_CLASS_DEFAULT; + exp->helper = NULL; + + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); + + nf_ct_refresh(ct, skb, timeout * HZ); +out: + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help); + +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e61511929c6..1909311c392 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -43,6 +43,7 @@ #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> @@ -282,6 +283,11 @@ EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); static void death_by_timeout(unsigned long ul_conntrack) { struct nf_conn *ct = (void *)ul_conntrack; + struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (tstamp && tstamp->stop == 0) + tstamp->stop = ktime_to_ns(ktime_get_real()); if (!test_bit(IPS_DYING_BIT, &ct->status) && unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { @@ -419,6 +425,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conn_help *help; + struct nf_conn_tstamp *tstamp; struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; @@ -486,8 +493,16 @@ __nf_conntrack_confirm(struct sk_buff *skb) ct->timeout.expires += jiffies; add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); - set_bit(IPS_CONFIRMED_BIT, &ct->status); + ct->status |= IPS_CONFIRMED; + + /* set conntrack timestamp, if enabled. */ + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + if (skb->tstamp.tv64 == 0) + __net_timestamp((struct sk_buff *)skb); + tstamp->start = ktime_to_ns(skb->tstamp); + } /* Since the lookup is lockless, hash insertion must be done after * starting the timer and setting the CONFIRMED bit. The RCU barriers * guarantee that no other CPU can find the conntrack before the above @@ -655,7 +670,8 @@ __nf_conntrack_alloc(struct net *net, u16 zone, * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged. */ memset(&ct->tuplehash[IP_CT_DIR_MAX], 0, - sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); + offsetof(struct nf_conn, proto) - + offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); spin_lock_init(&ct->lock); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; @@ -745,6 +761,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, @@ -1185,6 +1202,11 @@ struct __nf_ct_flush_report { static int kill_report(struct nf_conn *i, void *data) { struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; + struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(i); + if (tstamp && tstamp->stop == 0) + tstamp->stop = ktime_to_ns(ktime_get_real()); /* If we fail to deliver the event, death_by_timeout() will retry */ if (nf_conntrack_event_report(IPCT_DESTROY, i, @@ -1201,9 +1223,9 @@ static int kill_all(struct nf_conn *i, void *data) return 1; } -void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size) +void nf_ct_free_hashtable(void *hash, unsigned int size) { - if (vmalloced) + if (is_vmalloc_addr(hash)) vfree(hash); else free_pages((unsigned long)hash, @@ -1270,8 +1292,7 @@ static void nf_conntrack_cleanup_net(struct net *net) goto i_see_dead_people; } - nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, - net->ct.htable_size); + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); nf_conntrack_ecache_fini(net); nf_conntrack_acct_fini(net); nf_conntrack_expect_fini(net); @@ -1300,21 +1321,18 @@ void nf_conntrack_cleanup(struct net *net) } } -void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls) +void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) { struct hlist_nulls_head *hash; unsigned int nr_slots, i; size_t sz; - *vmalloced = 0; - BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); sz = nr_slots * sizeof(struct hlist_nulls_head); hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); if (!hash) { - *vmalloced = 1; printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); @@ -1330,7 +1348,7 @@ EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable); int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) { - int i, bucket, vmalloced, old_vmalloced; + int i, bucket; unsigned int hashsize, old_size; struct hlist_nulls_head *hash, *old_hash; struct nf_conntrack_tuple_hash *h; @@ -1347,7 +1365,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) if (!hashsize) return -EINVAL; - hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1); + hash = nf_ct_alloc_hashtable(&hashsize, 1); if (!hash) return -ENOMEM; @@ -1369,15 +1387,13 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) } } old_size = init_net.ct.htable_size; - old_vmalloced = init_net.ct.hash_vmalloc; old_hash = init_net.ct.hash; init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; - init_net.ct.hash_vmalloc = vmalloced; init_net.ct.hash = hash; spin_unlock_bh(&nf_conntrack_lock); - nf_ct_free_hashtable(old_hash, old_vmalloced, old_size); + nf_ct_free_hashtable(old_hash, old_size); return 0; } EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); @@ -1490,8 +1506,7 @@ static int nf_conntrack_init_net(struct net *net) } net->ct.htable_size = nf_conntrack_htable_size; - net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, - &net->ct.hash_vmalloc, 1); + net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); if (!net->ct.hash) { ret = -ENOMEM; printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); @@ -1503,6 +1518,9 @@ static int nf_conntrack_init_net(struct net *net) ret = nf_conntrack_acct_init(net); if (ret < 0) goto err_acct; + ret = nf_conntrack_tstamp_init(net); + if (ret < 0) + goto err_tstamp; ret = nf_conntrack_ecache_init(net); if (ret < 0) goto err_ecache; @@ -1510,12 +1528,13 @@ static int nf_conntrack_init_net(struct net *net) return 0; err_ecache: + nf_conntrack_tstamp_fini(net); +err_tstamp: nf_conntrack_acct_fini(net); err_acct: nf_conntrack_expect_fini(net); err_expect: - nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, - net->ct.htable_size); + nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); err_hash: kmem_cache_destroy(net->ct.nf_conntrack_cachep); err_cache: diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index a20fb0bd1ef..cd1e8e0970f 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -319,7 +319,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) const struct nf_conntrack_expect_policy *p; unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); - atomic_inc(&exp->use); + /* two references : one for hash insert, one for the timer */ + atomic_add(2, &exp->use); if (master_help) { hlist_add_head(&exp->lnode, &master_help->expectations); @@ -333,12 +334,14 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); if (master_help) { - p = &master_help->helper->expect_policy[exp->class]; + p = &rcu_dereference_protected( + master_help->helper, + lockdep_is_held(&nf_conntrack_lock) + )->expect_policy[exp->class]; exp->timeout.expires = jiffies + p->timeout * HZ; } add_timer(&exp->timeout); - atomic_inc(&exp->use); NF_CT_STAT_INC(net, expect_create); } @@ -369,7 +372,10 @@ static inline int refresh_timer(struct nf_conntrack_expect *i) if (!del_timer(&i->timeout)) return 0; - p = &master_help->helper->expect_policy[i->class]; + p = &rcu_dereference_protected( + master_help->helper, + lockdep_is_held(&nf_conntrack_lock) + )->expect_policy[i->class]; i->timeout.expires = jiffies + p->timeout * HZ; add_timer(&i->timeout); return 1; @@ -407,7 +413,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) } /* Will be over limit? */ if (master_help) { - p = &master_help->helper->expect_policy[expect->class]; + p = &rcu_dereference_protected( + master_help->helper, + lockdep_is_held(&nf_conntrack_lock) + )->expect_policy[expect->class]; if (p->max_expected && master_help->expecting[expect->class] >= p->max_expected) { evict_oldest_expect(master, expect); @@ -478,7 +487,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(net->ct.expect_hash[st->bucket].first); + n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); if (n) return n; } @@ -491,11 +500,11 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(net->ct.expect_hash[st->bucket].first); + head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); } return head; } @@ -630,8 +639,7 @@ int nf_conntrack_expect_init(struct net *net) } net->ct.expect_count = 0; - net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, - &net->ct.expect_vmalloc, 0); + net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); if (net->ct.expect_hash == NULL) goto err1; @@ -653,8 +661,7 @@ err3: if (net_eq(net, &init_net)) kmem_cache_destroy(nf_ct_expect_cachep); err2: - nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, - nf_ct_expect_hsize); + nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); err1: return err; } @@ -666,6 +673,5 @@ void nf_conntrack_expect_fini(struct net *net) rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); } - nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, - nf_ct_expect_hsize); + nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index bd82450c193..80a23ed62bb 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -140,15 +140,16 @@ static void update_alloc_size(struct nf_ct_ext_type *type) /* This assumes that extended areas in conntrack for the types whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */ for (i = min; i <= max; i++) { - t1 = nf_ct_ext_types[i]; + t1 = rcu_dereference_protected(nf_ct_ext_types[i], + lockdep_is_held(&nf_ct_ext_type_mutex)); if (!t1) continue; - t1->alloc_size = sizeof(struct nf_ct_ext) - + ALIGN(sizeof(struct nf_ct_ext), t1->align) - + t1->len; + t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) + + t1->len; for (j = 0; j < NF_CT_EXT_NUM; j++) { - t2 = nf_ct_ext_types[j]; + t2 = rcu_dereference_protected(nf_ct_ext_types[j], + lockdep_is_held(&nf_ct_ext_type_mutex)); if (t2 == NULL || t2 == t1 || (t2->flags & NF_CT_EXT_F_PREALLOC) == 0) continue; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 59e1a4cd4e8..1bdfea35795 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -33,7 +33,6 @@ static DEFINE_MUTEX(nf_ct_helper_mutex); static struct hlist_head *nf_ct_helper_hash __read_mostly; static unsigned int nf_ct_helper_hsize __read_mostly; static unsigned int nf_ct_helper_count __read_mostly; -static int nf_ct_helper_vmalloc; /* Stupid hash, but collision free for the default registrations of the @@ -158,7 +157,10 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i, struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); struct nf_conn_help *help = nfct_help(ct); - if (help && help->helper == me) { + if (help && rcu_dereference_protected( + help->helper, + lockdep_is_held(&nf_conntrack_lock) + ) == me) { nf_conntrack_event(IPCT_HELPER, ct); rcu_assign_pointer(help->helper, NULL); } @@ -210,7 +212,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, hlist_for_each_entry_safe(exp, n, next, &net->ct.expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); - if ((help->helper == me || exp->helper == me) && + if ((rcu_dereference_protected( + help->helper, + lockdep_is_held(&nf_conntrack_lock) + ) == me || exp->helper == me) && del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); @@ -261,8 +266,7 @@ int nf_conntrack_helper_init(void) int err; nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ - nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, - &nf_ct_helper_vmalloc, 0); + nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0); if (!nf_ct_helper_hash) return -ENOMEM; @@ -273,14 +277,12 @@ int nf_conntrack_helper_init(void) return 0; err1: - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, - nf_ct_helper_hsize); + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); return err; } void nf_conntrack_helper_fini(void) { nf_ct_extend_unregister(&helper_extend); - nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, - nf_ct_helper_hsize); + nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize); } diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index aadde018a07..4c8f30a3d6d 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -18,14 +18,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/inetdevice.h> -#include <linux/if_addr.h> #include <linux/in.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <net/route.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> @@ -40,75 +33,26 @@ MODULE_ALIAS("ip_conntrack_netbios_ns"); MODULE_ALIAS_NFCT_HELPER("netbios_ns"); static unsigned int timeout __read_mostly = 3; -module_param(timeout, uint, 0400); +module_param(timeout, uint, S_IRUSR); MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); -static int help(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) -{ - struct nf_conntrack_expect *exp; - struct iphdr *iph = ip_hdr(skb); - struct rtable *rt = skb_rtable(skb); - struct in_device *in_dev; - __be32 mask = 0; - - /* we're only interested in locally generated packets */ - if (skb->sk == NULL) - goto out; - if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST)) - goto out; - if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) - goto out; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(rt->dst.dev); - if (in_dev != NULL) { - for_primary_ifa(in_dev) { - if (ifa->ifa_broadcast == iph->daddr) { - mask = ifa->ifa_mask; - break; - } - } endfor_ifa(in_dev); - } - rcu_read_unlock(); - - if (mask == 0) - goto out; - - exp = nf_ct_expect_alloc(ct); - if (exp == NULL) - goto out; - - exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - exp->tuple.src.u.udp.port = htons(NMBD_PORT); - - exp->mask.src.u3.ip = mask; - exp->mask.src.u.udp.port = htons(0xFFFF); - - exp->expectfn = NULL; - exp->flags = NF_CT_EXPECT_PERMANENT; - exp->class = NF_CT_EXPECT_CLASS_DEFAULT; - exp->helper = NULL; - - nf_ct_expect_related(exp); - nf_ct_expect_put(exp); - - nf_ct_refresh(ct, skb, timeout * HZ); -out: - return NF_ACCEPT; -} - static struct nf_conntrack_expect_policy exp_policy = { .max_expected = 1, }; +static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); +} + static struct nf_conntrack_helper helper __read_mostly = { .name = "netbios-ns", - .tuple.src.l3num = AF_INET, + .tuple.src.l3num = NFPROTO_IPV4, .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT), .tuple.dst.protonum = IPPROTO_UDP, .me = THIS_MODULE, - .help = help, + .help = netbios_ns_help, .expect_policy = &exp_policy, }; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 93297aaceb2..3fec12c570a 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -42,6 +42,7 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> #ifdef CONFIG_NF_NAT_NEEDED #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_protocol.h> @@ -230,6 +231,33 @@ nla_put_failure: return -1; } +static int +ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) +{ + struct nlattr *nest_count; + const struct nf_conn_tstamp *tstamp; + + tstamp = nf_conn_tstamp_find(ct); + if (!tstamp) + return 0; + + nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED); + if (!nest_count) + goto nla_put_failure; + + NLA_PUT_BE64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)); + if (tstamp->stop != 0) { + NLA_PUT_BE64(skb, CTA_TIMESTAMP_STOP, + cpu_to_be64(tstamp->stop)); + } + nla_nest_end(skb, nest_count); + + return 0; + +nla_put_failure: + return -1; +} + #ifdef CONFIG_NF_CONNTRACK_MARK static inline int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) @@ -404,6 +432,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, ctnetlink_dump_timeout(skb, ct) < 0 || ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0 || ctnetlink_dump_protoinfo(skb, ct) < 0 || ctnetlink_dump_helpinfo(skb, ct) < 0 || ctnetlink_dump_mark(skb, ct) < 0 || @@ -471,6 +500,18 @@ ctnetlink_secctx_size(const struct nf_conn *ct) } static inline size_t +ctnetlink_timestamp_size(const struct nf_conn *ct) +{ +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) + return 0; + return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t)); +#else + return 0; +#endif +} + +static inline size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) { return NLMSG_ALIGN(sizeof(struct nfgenmsg)) @@ -481,6 +522,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ + ctnetlink_counters_size(ct) + + ctnetlink_timestamp_size(ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ + nla_total_size(0) /* CTA_PROTOINFO */ + nla_total_size(0) /* CTA_HELP */ @@ -571,7 +613,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) if (events & (1 << IPCT_DESTROY)) { if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || - ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_timestamp(skb, ct) < 0) goto nla_put_failure; } else { if (ctnetlink_dump_timeout(skb, ct) < 0) @@ -1357,6 +1400,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, } nf_ct_acct_ext_add(ct, GFP_ATOMIC); + nf_ct_tstamp_ext_add(ct, GFP_ATOMIC); nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); /* we must add conntrack extensions before confirmation. */ ct->status |= IPS_CONFIRMED; @@ -1375,6 +1419,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, } #endif + memset(&ct->proto, 0, sizeof(ct->proto)); if (cda[CTA_PROTOINFO]) { err = ctnetlink_change_protoinfo(ct, cda); if (err < 0) diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index dc7bb74110d..5701c8dd783 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -166,6 +166,7 @@ static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) { int ret = 0; + struct nf_conntrack_l3proto *old; if (proto->l3proto >= AF_MAX) return -EBUSY; @@ -174,7 +175,9 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) return -EINVAL; mutex_lock(&nf_ct_proto_mutex); - if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) { + old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], + lockdep_is_held(&nf_ct_proto_mutex)); + if (old != &nf_conntrack_l3proto_generic) { ret = -EBUSY; goto out_unlock; } @@ -201,7 +204,9 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) BUG_ON(proto->l3proto >= AF_MAX); mutex_lock(&nf_ct_proto_mutex); - BUG_ON(nf_ct_l3protos[proto->l3proto] != proto); + BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != proto); rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], &nf_conntrack_l3proto_generic); nf_ct_l3proto_unregister_sysctl(proto); @@ -279,7 +284,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) mutex_lock(&nf_ct_proto_mutex); if (!nf_ct_protos[l4proto->l3proto]) { /* l3proto may be loaded latter. */ - struct nf_conntrack_l4proto **proto_array; + struct nf_conntrack_l4proto __rcu **proto_array; int i; proto_array = kmalloc(MAX_NF_CT_PROTO * @@ -291,7 +296,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) } for (i = 0; i < MAX_NF_CT_PROTO; i++) - proto_array[i] = &nf_conntrack_l4proto_generic; + RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic); /* Before making proto_array visible to lockless readers, * we must make sure its content is committed to memory. @@ -299,8 +304,10 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto) smp_wmb(); nf_ct_protos[l4proto->l3proto] = proto_array; - } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != - &nf_conntrack_l4proto_generic) { + } else if (rcu_dereference_protected( + nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != &nf_conntrack_l4proto_generic) { ret = -EBUSY; goto out_unlock; } @@ -331,7 +338,10 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) BUG_ON(l4proto->l3proto >= PF_MAX); mutex_lock(&nf_ct_proto_mutex); - BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); + BUG_ON(rcu_dereference_protected( + nf_ct_protos[l4proto->l3proto][l4proto->l4proto], + lockdep_is_held(&nf_ct_proto_mutex) + ) != l4proto); rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], &nf_conntrack_l4proto_generic); nf_ct_l4proto_unregister_sysctl(l4proto); diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 5292560d6d4..9ae57c57c50 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -452,6 +452,9 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; ct->proto.dccp.state = CT_DCCP_NONE; + ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; + ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; + ct->proto.dccp.handshake_seq = 0; return true; out_invalid: diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index c6049c2d5ea..6f4ee70f460 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -413,6 +413,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, test_bit(SCTP_CID_COOKIE_ACK, map)) return false; + memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp)); new_state = SCTP_CONNTRACK_MAX; for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { /* Don't need lock here: this conntrack not in circulation yet */ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 3fb2b73b24d..6f38d0e2ea4 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1066,9 +1066,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, BUG_ON(th == NULL); /* Don't need lock here: this conntrack not in circulation yet */ - new_state - = tcp_conntracks[0][get_conntrack_index(th)] - [TCP_CONNTRACK_NONE]; + new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE]; /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { @@ -1077,6 +1075,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, } if (new_state == TCP_CONNTRACK_SYN_SENT) { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* SYN packet */ ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, @@ -1088,11 +1087,11 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.tcp.seen[0].td_end; tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); - ct->proto.tcp.seen[1].flags = 0; } else if (nf_ct_tcp_loose == 0) { /* Don't try to pick up connections. */ return false; } else { + memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp)); /* * We are in the middle of a connection, * its history is lost for us. @@ -1107,7 +1106,6 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.tcp.seen[0].td_maxend = ct->proto.tcp.seen[0].td_end + ct->proto.tcp.seen[0].td_maxwin; - ct->proto.tcp.seen[0].td_scale = 0; /* We assume SACK and liberal window checking to handle * window scaling */ @@ -1116,13 +1114,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, IP_CT_TCP_FLAG_BE_LIBERAL; } - ct->proto.tcp.seen[1].td_end = 0; - ct->proto.tcp.seen[1].td_maxend = 0; - ct->proto.tcp.seen[1].td_maxwin = 0; - ct->proto.tcp.seen[1].td_scale = 0; - /* tcp_packet will set them */ - ct->proto.tcp.state = TCP_CONNTRACK_NONE; ct->proto.tcp.last_index = TCP_NONE_SET; pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c new file mode 100644 index 00000000000..6e545e26289 --- /dev/null +++ b/net/netfilter/nf_conntrack_snmp.c @@ -0,0 +1,77 @@ +/* + * SNMP service broadcast connection tracking helper + * + * (c) 2011 Jiri Olsa <jolsa@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/in.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_expect.h> + +#define SNMP_PORT 161 + +MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>"); +MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NFCT_HELPER("snmp"); + +static unsigned int timeout __read_mostly = 30; +module_param(timeout, uint, S_IRUSR); +MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); + +int (*nf_nat_snmp_hook)(struct sk_buff *skb, + unsigned int protoff, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo); +EXPORT_SYMBOL_GPL(nf_nat_snmp_hook); + +static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff, + struct nf_conn *ct, enum ip_conntrack_info ctinfo) +{ + typeof(nf_nat_snmp_hook) nf_nat_snmp; + + nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout); + + nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook); + if (nf_nat_snmp && ct->status & IPS_NAT_MASK) + return nf_nat_snmp(skb, protoff, ct, ctinfo); + + return NF_ACCEPT; +} + +static struct nf_conntrack_expect_policy exp_policy = { + .max_expected = 1, +}; + +static struct nf_conntrack_helper helper __read_mostly = { + .name = "snmp", + .tuple.src.l3num = NFPROTO_IPV4, + .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT), + .tuple.dst.protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .help = snmp_conntrack_help, + .expect_policy = &exp_policy, +}; + +static int __init nf_conntrack_snmp_init(void) +{ + exp_policy.timeout = timeout; + return nf_conntrack_helper_register(&helper); +} + +static void __exit nf_conntrack_snmp_fini(void) +{ + nf_conntrack_helper_unregister(&helper); +} + +module_init(nf_conntrack_snmp_init); +module_exit(nf_conntrack_snmp_fini); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index b4d7f0f24b2..0ae14282588 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -29,6 +29,8 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/nf_conntrack_timestamp.h> +#include <linux/rculist_nulls.h> MODULE_LICENSE("GPL"); @@ -45,6 +47,7 @@ EXPORT_SYMBOL_GPL(print_tuple); struct ct_iter_state { struct seq_net_private p; unsigned int bucket; + u_int64_t time_now; }; static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) @@ -56,7 +59,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < net->ct.htable_size; st->bucket++) { - n = rcu_dereference(net->ct.hash[st->bucket].first); + n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -69,13 +72,15 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; - head = rcu_dereference(head->next); + head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { if (++st->bucket >= net->ct.htable_size) return NULL; } - head = rcu_dereference(net->ct.hash[st->bucket].first); + head = rcu_dereference( + hlist_nulls_first_rcu( + &net->ct.hash[st->bucket])); } return head; } @@ -93,6 +98,9 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos) static void *ct_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { + struct ct_iter_state *st = seq->private; + + st->time_now = ktime_to_ns(ktime_get_real()); rcu_read_lock(); return ct_get_idx(seq, *pos); } @@ -132,6 +140,34 @@ static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) } #endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP +static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ + struct ct_iter_state *st = s->private; + struct nf_conn_tstamp *tstamp; + s64 delta_time; + + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) { + delta_time = st->time_now - tstamp->start; + if (delta_time > 0) + delta_time = div_s64(delta_time, NSEC_PER_SEC); + else + delta_time = 0; + + return seq_printf(s, "delta-time=%llu ", + (unsigned long long)delta_time); + } + return 0; +} +#else +static inline int +ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) +{ + return 0; +} +#endif + /* return 0 on success, 1 in case of error */ static int ct_seq_show(struct seq_file *s, void *v) { @@ -200,6 +236,9 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; #endif + if (ct_show_delta_time(s, ct)) + goto release; + if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) goto release; diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c new file mode 100644 index 00000000000..af7dd31af0a --- /dev/null +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -0,0 +1,120 @@ +/* + * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation (or any later at your option). + */ + +#include <linux/netfilter.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> + +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_conntrack_timestamp.h> + +static int nf_ct_tstamp __read_mostly; + +module_param_named(tstamp, nf_ct_tstamp, bool, 0644); +MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); + +#ifdef CONFIG_SYSCTL +static struct ctl_table tstamp_sysctl_table[] = { + { + .procname = "nf_conntrack_timestamp", + .data = &init_net.ct.sysctl_tstamp, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + {} +}; +#endif /* CONFIG_SYSCTL */ + +static struct nf_ct_ext_type tstamp_extend __read_mostly = { + .len = sizeof(struct nf_conn_tstamp), + .align = __alignof__(struct nf_conn_tstamp), + .id = NF_CT_EXT_TSTAMP, +}; + +#ifdef CONFIG_SYSCTL +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_tstamp; + + net->ct.tstamp_sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.tstamp_sysctl_header) { + printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n"); + goto out_register; + } + return 0; + +out_register: + kfree(table); +out: + return -ENOMEM; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ + struct ctl_table *table; + + table = net->ct.tstamp_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.tstamp_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_tstamp_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_tstamp_fini_sysctl(struct net *net) +{ +} +#endif + +int nf_conntrack_tstamp_init(struct net *net) +{ + int ret; + + net->ct.sysctl_tstamp = nf_ct_tstamp; + + if (net_eq(net, &init_net)) { + ret = nf_ct_extend_register(&tstamp_extend); + if (ret < 0) { + printk(KERN_ERR "nf_ct_tstamp: Unable to register " + "extension\n"); + goto out_extend_register; + } + } + + ret = nf_conntrack_tstamp_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + + return 0; + +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&tstamp_extend); +out_extend_register: + return ret; +} + +void nf_conntrack_tstamp_fini(struct net *net) +{ + nf_conntrack_tstamp_fini_sysctl(net); + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&tstamp_extend); +} diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index b07393eab88..20c775cff2a 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -161,7 +161,8 @@ static int seq_show(struct seq_file *s, void *v) struct nf_logger *t; int ret; - logger = nf_loggers[*pos]; + logger = rcu_dereference_protected(nf_loggers[*pos], + lockdep_is_held(&nf_log_mutex)); if (!logger) ret = seq_printf(s, "%2lld NONE (", *pos); @@ -249,7 +250,8 @@ static int nf_log_proc_dostring(ctl_table *table, int write, mutex_unlock(&nf_log_mutex); } else { mutex_lock(&nf_log_mutex); - logger = nf_loggers[tindex]; + logger = rcu_dereference_protected(nf_loggers[tindex], + lockdep_is_held(&nf_log_mutex)); if (!logger) table->data = "NONE"; else diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 74aebed5bd2..5ab22e2bbd7 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -27,14 +27,17 @@ static DEFINE_MUTEX(queue_handler_mutex); int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) { int ret; + const struct nf_queue_handler *old; if (pf >= ARRAY_SIZE(queue_handler)) return -EINVAL; mutex_lock(&queue_handler_mutex); - if (queue_handler[pf] == qh) + old = rcu_dereference_protected(queue_handler[pf], + lockdep_is_held(&queue_handler_mutex)); + if (old == qh) ret = -EEXIST; - else if (queue_handler[pf]) + else if (old) ret = -EBUSY; else { rcu_assign_pointer(queue_handler[pf], qh); @@ -49,11 +52,15 @@ EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) { + const struct nf_queue_handler *old; + if (pf >= ARRAY_SIZE(queue_handler)) return -EINVAL; mutex_lock(&queue_handler_mutex); - if (queue_handler[pf] && queue_handler[pf] != qh) { + old = rcu_dereference_protected(queue_handler[pf], + lockdep_is_held(&queue_handler_mutex)); + if (old && old != qh) { mutex_unlock(&queue_handler_mutex); return -EINVAL; } @@ -73,7 +80,10 @@ void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) mutex_lock(&queue_handler_mutex); for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) { - if (queue_handler[pf] == qh) + if (rcu_dereference_protected( + queue_handler[pf], + lockdep_is_held(&queue_handler_mutex) + ) == qh) rcu_assign_pointer(queue_handler[pf], NULL); } mutex_unlock(&queue_handler_mutex); @@ -115,7 +125,7 @@ static int __nf_queue(struct sk_buff *skb, int (*okfn)(struct sk_buff *), unsigned int queuenum) { - int status; + int status = -ENOENT; struct nf_queue_entry *entry = NULL; #ifdef CONFIG_BRIDGE_NETFILTER struct net_device *physindev; @@ -128,16 +138,20 @@ static int __nf_queue(struct sk_buff *skb, rcu_read_lock(); qh = rcu_dereference(queue_handler[pf]); - if (!qh) + if (!qh) { + status = -ESRCH; goto err_unlock; + } afinfo = nf_get_afinfo(pf); if (!afinfo) goto err_unlock; entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); - if (!entry) + if (!entry) { + status = -ENOMEM; goto err_unlock; + } *entry = (struct nf_queue_entry) { .skb = skb, @@ -151,11 +165,9 @@ static int __nf_queue(struct sk_buff *skb, /* If it's going away, ignore hook. */ if (!try_module_get(entry->elem->owner)) { - rcu_read_unlock(); - kfree(entry); - return 0; + status = -ECANCELED; + goto err_unlock; } - /* Bump dev refs so they don't vanish while packet is out */ if (indev) dev_hold(indev); @@ -182,14 +194,13 @@ static int __nf_queue(struct sk_buff *skb, goto err; } - return 1; + return 0; err_unlock: rcu_read_unlock(); err: - kfree_skb(skb); kfree(entry); - return 1; + return status; } int nf_queue(struct sk_buff *skb, @@ -201,6 +212,8 @@ int nf_queue(struct sk_buff *skb, unsigned int queuenum) { struct sk_buff *segs; + int err; + unsigned int queued; if (!skb_is_gso(skb)) return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, @@ -216,20 +229,35 @@ int nf_queue(struct sk_buff *skb, } segs = skb_gso_segment(skb, 0); - kfree_skb(skb); + /* Does not use PTR_ERR to limit the number of error codes that can be + * returned by nf_queue. For instance, callers rely on -ECANCELED to mean + * 'ignore this hook'. + */ if (IS_ERR(segs)) - return 1; + return -EINVAL; + queued = 0; + err = 0; do { struct sk_buff *nskb = segs->next; segs->next = NULL; - if (!__nf_queue(segs, elem, pf, hook, indev, outdev, okfn, - queuenum)) + if (err == 0) + err = __nf_queue(segs, elem, pf, hook, indev, + outdev, okfn, queuenum); + if (err == 0) + queued++; + else kfree_skb(segs); segs = nskb; } while (segs); - return 1; + + /* also free orig skb if only some segments were queued */ + if (unlikely(err && queued)) + err = 0; + if (err == 0) + kfree_skb(skb); + return err; } void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) @@ -237,6 +265,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) struct sk_buff *skb = entry->skb; struct list_head *elem = &entry->elem->list; const struct nf_afinfo *afinfo; + int err; rcu_read_lock(); @@ -270,10 +299,17 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) local_bh_enable(); break; case NF_QUEUE: - if (!__nf_queue(skb, elem, entry->pf, entry->hook, - entry->indev, entry->outdev, entry->okfn, - verdict >> NF_VERDICT_BITS)) - goto next_hook; + err = __nf_queue(skb, elem, entry->pf, entry->hook, + entry->indev, entry->outdev, entry->okfn, + verdict >> NF_VERDICT_QBITS); + if (err < 0) { + if (err == -ECANCELED) + goto next_hook; + if (err == -ESRCH && + (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) + goto next_hook; + kfree_skb(skb); + } break; case NF_STOLEN: default: diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 6a1572b0ab4..91592da504b 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -874,19 +874,19 @@ static struct hlist_node *get_first(struct iter_state *st) for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { if (!hlist_empty(&instance_table[st->bucket])) - return rcu_dereference_bh(instance_table[st->bucket].first); + return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); } return NULL; } static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) { - h = rcu_dereference_bh(h->next); + h = rcu_dereference_bh(hlist_next_rcu(h)); while (!h) { if (++st->bucket >= INSTANCE_BUCKETS) return NULL; - h = rcu_dereference_bh(instance_table[st->bucket].first); + h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket])); } return h; } diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 68e67d19724..b83123f12b4 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -387,25 +387,31 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { struct sk_buff *nskb; struct nfqnl_instance *queue; - int err; + int err = -ENOBUFS; /* rcu_read_lock()ed by nf_hook_slow() */ queue = instance_lookup(queuenum); - if (!queue) + if (!queue) { + err = -ESRCH; goto err_out; + } - if (queue->copy_mode == NFQNL_COPY_NONE) + if (queue->copy_mode == NFQNL_COPY_NONE) { + err = -EINVAL; goto err_out; + } nskb = nfqnl_build_packet_message(queue, entry); - if (nskb == NULL) + if (nskb == NULL) { + err = -ENOMEM; goto err_out; - + } spin_lock_bh(&queue->lock); - if (!queue->peer_pid) + if (!queue->peer_pid) { + err = -EINVAL; goto err_out_free_nskb; - + } if (queue->queue_total >= queue->queue_maxlen) { queue->queue_dropped++; if (net_ratelimit()) @@ -432,7 +438,7 @@ err_out_free_nskb: err_out_unlock: spin_unlock_bh(&queue->lock); err_out: - return -1; + return err; } static int diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c9423763107..0a77d2ff215 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -23,6 +23,7 @@ #include <linux/mutex.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/audit.h> #include <net/net_namespace.h> #include <linux/netfilter/x_tables.h> @@ -38,9 +39,8 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) struct compat_delta { - struct compat_delta *next; - unsigned int offset; - int delta; + unsigned int offset; /* offset in kernel */ + int delta; /* delta in 32bit user land */ }; struct xt_af { @@ -49,7 +49,9 @@ struct xt_af { struct list_head target; #ifdef CONFIG_COMPAT struct mutex compat_mutex; - struct compat_delta *compat_offsets; + struct compat_delta *compat_tab; + unsigned int number; /* number of slots in compat_tab[] */ + unsigned int cur; /* number of used slots in compat_tab[] */ #endif }; @@ -414,54 +416,67 @@ int xt_check_match(struct xt_mtchk_param *par, EXPORT_SYMBOL_GPL(xt_check_match); #ifdef CONFIG_COMPAT -int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta) +int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta) { - struct compat_delta *tmp; + struct xt_af *xp = &xt[af]; - tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL); - if (!tmp) - return -ENOMEM; + if (!xp->compat_tab) { + if (!xp->number) + return -EINVAL; + xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number); + if (!xp->compat_tab) + return -ENOMEM; + xp->cur = 0; + } - tmp->offset = offset; - tmp->delta = delta; + if (xp->cur >= xp->number) + return -EINVAL; - if (xt[af].compat_offsets) { - tmp->next = xt[af].compat_offsets->next; - xt[af].compat_offsets->next = tmp; - } else { - xt[af].compat_offsets = tmp; - tmp->next = NULL; - } + if (xp->cur) + delta += xp->compat_tab[xp->cur - 1].delta; + xp->compat_tab[xp->cur].offset = offset; + xp->compat_tab[xp->cur].delta = delta; + xp->cur++; return 0; } EXPORT_SYMBOL_GPL(xt_compat_add_offset); void xt_compat_flush_offsets(u_int8_t af) { - struct compat_delta *tmp, *next; - - if (xt[af].compat_offsets) { - for (tmp = xt[af].compat_offsets; tmp; tmp = next) { - next = tmp->next; - kfree(tmp); - } - xt[af].compat_offsets = NULL; + if (xt[af].compat_tab) { + vfree(xt[af].compat_tab); + xt[af].compat_tab = NULL; + xt[af].number = 0; } } EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); int xt_compat_calc_jump(u_int8_t af, unsigned int offset) { - struct compat_delta *tmp; - int delta; - - for (tmp = xt[af].compat_offsets, delta = 0; tmp; tmp = tmp->next) - if (tmp->offset < offset) - delta += tmp->delta; - return delta; + struct compat_delta *tmp = xt[af].compat_tab; + int mid, left = 0, right = xt[af].cur - 1; + + while (left <= right) { + mid = (left + right) >> 1; + if (offset > tmp[mid].offset) + left = mid + 1; + else if (offset < tmp[mid].offset) + right = mid - 1; + else + return mid ? tmp[mid - 1].delta : 0; + } + WARN_ON_ONCE(1); + return 0; } EXPORT_SYMBOL_GPL(xt_compat_calc_jump); +void xt_compat_init_offsets(u_int8_t af, unsigned int number) +{ + xt[af].number = number; + xt[af].cur = 0; +} +EXPORT_SYMBOL(xt_compat_init_offsets); + int xt_compat_match_offset(const struct xt_match *match) { u_int16_t csize = match->compatsize ? : match->matchsize; @@ -820,6 +835,21 @@ xt_replace_table(struct xt_table *table, */ local_bh_enable(); +#ifdef CONFIG_AUDIT + if (audit_enabled) { + struct audit_buffer *ab; + + ab = audit_log_start(current->audit_context, GFP_KERNEL, + AUDIT_NETFILTER_CFG); + if (ab) { + audit_log_format(ab, "table=%s family=%u entries=%u", + table->name, table->af, + private->number); + audit_log_end(ab); + } + } +#endif + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -1338,7 +1368,7 @@ static int __init xt_init(void) mutex_init(&xt[i].mutex); #ifdef CONFIG_COMPAT mutex_init(&xt[i].compat_mutex); - xt[i].compat_offsets = NULL; + xt[i].compat_tab = NULL; #endif INIT_LIST_HEAD(&xt[i].target); INIT_LIST_HEAD(&xt[i].match); diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c new file mode 100644 index 00000000000..81802d27346 --- /dev/null +++ b/net/netfilter/xt_AUDIT.c @@ -0,0 +1,204 @@ +/* + * Creates audit record for dropped/accepted packets + * + * (C) 2010-2011 Thomas Graf <tgraf@redhat.com> + * (C) 2010-2011 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/audit.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/if_arp.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_AUDIT.h> +#include <net/ipv6.h> +#include <net/ip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>"); +MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets"); +MODULE_ALIAS("ipt_AUDIT"); +MODULE_ALIAS("ip6t_AUDIT"); +MODULE_ALIAS("ebt_AUDIT"); +MODULE_ALIAS("arpt_AUDIT"); + +static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb, + unsigned int proto, unsigned int offset) +{ + switch (proto) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_UDPLITE: { + const __be16 *pptr; + __be16 _ports[2]; + + pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports); + if (pptr == NULL) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " sport=%hu dport=%hu", + ntohs(pptr[0]), ntohs(pptr[1])); + } + break; + + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: { + const u8 *iptr; + u8 _ih[2]; + + iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih); + if (iptr == NULL) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu", + iptr[0], iptr[1]); + + } + break; + } +} + +static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct iphdr _iph; + const struct iphdr *ih; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (!ih) { + audit_log_format(ab, " truncated=1"); + return; + } + + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu", + &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol); + + if (ntohs(ih->frag_off) & IP_OFFSET) { + audit_log_format(ab, " frag=1"); + return; + } + + audit_proto(ab, skb, ih->protocol, ih->ihl * 4); +} + +static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + u8 nexthdr; + int offset; + + ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); + if (!ih) { + audit_log_format(ab, " truncated=1"); + return; + } + + nexthdr = ih->nexthdr; + offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), + &nexthdr); + + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", + &ih->saddr, &ih->daddr, nexthdr); + + if (offset) + audit_proto(ab, skb, nexthdr, offset); +} + +static unsigned int +audit_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_audit_info *info = par->targinfo; + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); + if (ab == NULL) + goto errout; + + audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s", + info->type, par->hooknum, skb->len, + par->in ? par->in->name : "?", + par->out ? par->out->name : "?"); + + if (skb->mark) + audit_log_format(ab, " mark=%#x", skb->mark); + + if (skb->dev && skb->dev->type == ARPHRD_ETHER) { + audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + ntohs(eth_hdr(skb)->h_proto)); + + if (par->family == NFPROTO_BRIDGE) { + switch (eth_hdr(skb)->h_proto) { + case __constant_htons(ETH_P_IP): + audit_ip4(ab, skb); + break; + + case __constant_htons(ETH_P_IPV6): + audit_ip6(ab, skb); + break; + } + } + } + + switch (par->family) { + case NFPROTO_IPV4: + audit_ip4(ab, skb); + break; + + case NFPROTO_IPV6: + audit_ip6(ab, skb); + break; + } + + audit_log_end(ab); + +errout: + return XT_CONTINUE; +} + +static int audit_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_audit_info *info = par->targinfo; + + if (info->type > XT_AUDIT_TYPE_MAX) { + pr_info("Audit type out of range (valid range: 0..%hhu)\n", + XT_AUDIT_TYPE_MAX); + return -ERANGE; + } + + return 0; +} + +static struct xt_target audit_tg_reg __read_mostly = { + .name = "AUDIT", + .family = NFPROTO_UNSPEC, + .target = audit_tg, + .targetsize = sizeof(struct xt_audit_info), + .checkentry = audit_tg_check, + .me = THIS_MODULE, +}; + +static int __init audit_tg_init(void) +{ + return xt_register_target(&audit_tg_reg); +} + +static void __exit audit_tg_exit(void) +{ + xt_unregister_target(&audit_tg_reg); +} + +module_init(audit_tg_init); +module_exit(audit_tg_exit); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index c2c0e4abeb9..af9c4dadf81 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -19,12 +19,14 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_CLASSIFY.h> +#include <linux/netfilter_arp.h> MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: Qdisc classification"); MODULE_ALIAS("ipt_CLASSIFY"); MODULE_ALIAS("ip6t_CLASSIFY"); +MODULE_ALIAS("arpt_CLASSIFY"); static unsigned int classify_tg(struct sk_buff *skb, const struct xt_action_param *par) @@ -35,26 +37,36 @@ classify_tg(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -static struct xt_target classify_tg_reg __read_mostly = { - .name = "CLASSIFY", - .revision = 0, - .family = NFPROTO_UNSPEC, - .table = "mangle", - .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_POST_ROUTING), - .target = classify_tg, - .targetsize = sizeof(struct xt_classify_target_info), - .me = THIS_MODULE, +static struct xt_target classify_tg_reg[] __read_mostly = { + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_UNSPEC, + .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, + { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_ARP, + .hooks = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, + }, }; static int __init classify_tg_init(void) { - return xt_register_target(&classify_tg_reg); + return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); } static void __exit classify_tg_exit(void) { - xt_unregister_target(&classify_tg_reg); + xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); } module_init(classify_tg_init); diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index be1f22e1354..3bdd443aaf1 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -313,3 +313,5 @@ MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>"); MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); MODULE_DESCRIPTION("Xtables: idle time monitor"); MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("ipt_IDLETIMER"); +MODULE_ALIAS("ip6t_IDLETIMER"); diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index a4140509eea..993de2ba89d 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -31,6 +31,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>"); MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match"); +MODULE_ALIAS("ipt_LED"); +MODULE_ALIAS("ip6t_LED"); static LIST_HEAD(xt_led_triggers); static DEFINE_MUTEX(xt_led_mutex); diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index 039cce1bde3..d4f4b5d66b2 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -72,18 +72,31 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par) if (info->queues_total > 1) { if (par->family == NFPROTO_IPV4) - queue = hash_v4(skb) % info->queues_total + queue; + queue = (((u64) hash_v4(skb) * info->queues_total) >> + 32) + queue; #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) else if (par->family == NFPROTO_IPV6) - queue = hash_v6(skb) % info->queues_total + queue; + queue = (((u64) hash_v6(skb) * info->queues_total) >> + 32) + queue; #endif } return NF_QUEUE_NR(queue); } -static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par) +static unsigned int +nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_NFQ_info_v1 *info = par->targinfo; + const struct xt_NFQ_info_v2 *info = par->targinfo; + unsigned int ret = nfqueue_tg_v1(skb, par); + + if (info->bypass) + ret |= NF_VERDICT_FLAG_QUEUE_BYPASS; + return ret; +} + +static int nfqueue_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_NFQ_info_v2 *info = par->targinfo; u32 maxid; if (unlikely(!rnd_inited)) { @@ -100,6 +113,8 @@ static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par) info->queues_total, maxid); return -ERANGE; } + if (par->target->revision == 2 && info->bypass > 1) + return -EINVAL; return 0; } @@ -115,11 +130,20 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = { .name = "NFQUEUE", .revision = 1, .family = NFPROTO_UNSPEC, - .checkentry = nfqueue_tg_v1_check, + .checkentry = nfqueue_tg_check, .target = nfqueue_tg_v1, .targetsize = sizeof(struct xt_NFQ_info_v1), .me = THIS_MODULE, }, + { + .name = "NFQUEUE", + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = nfqueue_tg_check, + .target = nfqueue_tg_v2, + .targetsize = sizeof(struct xt_NFQ_info_v2), + .me = THIS_MODULE, + }, }; static int __init nfqueue_tg_init(void) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 5c5b6b921b8..7fd3fd51f27 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -193,10 +193,12 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) if (par->family == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); - memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr)); + memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ? + &iph->daddr : &iph->saddr, sizeof(addr.ip6)); } else { const struct iphdr *iph = ip_hdr(skb); - addr.ip = iph->saddr; + addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ? + iph->daddr : iph->saddr; } spin_lock_bh(&info->data->lock); @@ -204,13 +206,12 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) &info->mask, par->family); spin_unlock_bh(&info->data->lock); - if (connections < 0) { + if (connections < 0) /* kmalloc failed, drop it entirely */ - par->hotdrop = true; - return false; - } + goto hotdrop; - return (connections > info->limit) ^ info->inverse; + return (connections > info->limit) ^ + !!(info->flags & XT_CONNLIMIT_INVERT); hotdrop: par->hotdrop = true; @@ -268,25 +269,38 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) kfree(info->data); } -static struct xt_match connlimit_mt_reg __read_mostly = { - .name = "connlimit", - .revision = 0, - .family = NFPROTO_UNSPEC, - .checkentry = connlimit_mt_check, - .match = connlimit_mt, - .matchsize = sizeof(struct xt_connlimit_info), - .destroy = connlimit_mt_destroy, - .me = THIS_MODULE, +static struct xt_match connlimit_mt_reg[] __read_mostly = { + { + .name = "connlimit", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "connlimit", + .revision = 1, + .family = NFPROTO_UNSPEC, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, + }, }; static int __init connlimit_mt_init(void) { - return xt_register_match(&connlimit_mt_reg); + return xt_register_matches(connlimit_mt_reg, + ARRAY_SIZE(connlimit_mt_reg)); } static void __exit connlimit_mt_exit(void) { - xt_unregister_match(&connlimit_mt_reg); + xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); } module_init(connlimit_mt_init); diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index e536710ad91..4ef1b63ad73 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -112,6 +112,54 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info, return true; } +static inline bool +port_match(u16 min, u16 max, u16 port, bool invert) +{ + return (port >= min && port <= max) ^ invert; +} + +static inline bool +ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info, + const struct nf_conn *ct) +{ + const struct nf_conntrack_tuple *tuple; + + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if ((info->match_flags & XT_CONNTRACK_PROTO) && + (nf_ct_protonum(ct) == info->l4proto) ^ + !(info->invert_flags & XT_CONNTRACK_PROTO)) + return false; + + /* Shortcut to match all recognized protocols by using ->src.all. */ + if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) && + !port_match(info->origsrc_port, info->origsrc_port_high, + ntohs(tuple->src.u.all), + info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) && + !port_match(info->origdst_port, info->origdst_port_high, + ntohs(tuple->dst.u.all), + info->invert_flags & XT_CONNTRACK_ORIGDST_PORT)) + return false; + + tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; + + if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) && + !port_match(info->replsrc_port, info->replsrc_port_high, + ntohs(tuple->src.u.all), + info->invert_flags & XT_CONNTRACK_REPLSRC_PORT)) + return false; + + if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) && + !port_match(info->repldst_port, info->repldst_port_high, + ntohs(tuple->dst.u.all), + info->invert_flags & XT_CONNTRACK_REPLDST_PORT)) + return false; + + return true; +} + static bool conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, u16 state_mask, u16 status_mask) @@ -170,8 +218,13 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, !(info->invert_flags & XT_CONNTRACK_REPLDST)) return false; - if (!ct_proto_port_check(info, ct)) - return false; + if (par->match->revision != 3) { + if (!ct_proto_port_check(info, ct)) + return false; + } else { + if (!ct_proto_port_check_v3(par->matchinfo, ct)) + return false; + } if ((info->match_flags & XT_CONNTRACK_STATUS) && (!!(status_mask & ct->status) ^ @@ -207,6 +260,14 @@ conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) return conntrack_mt(skb, par, info->state_mask, info->status_mask); } +static bool +conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_conntrack_mtinfo3 *info = par->matchinfo; + + return conntrack_mt(skb, par, info->state_mask, info->status_mask); +} + static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; @@ -244,6 +305,16 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { .destroy = conntrack_mt_destroy, .me = THIS_MODULE, }, + { + .name = "conntrack", + .revision = 3, + .family = NFPROTO_UNSPEC, + .matchsize = sizeof(struct xt_conntrack_mtinfo3), + .match = conntrack_mt_v3, + .checkentry = conntrack_mt_check, + .destroy = conntrack_mt_destroy, + .me = THIS_MODULE, + }, }; static int __init conntrack_mt_init(void) diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c index b39db8a5cba..c7a2e5466bc 100644 --- a/net/netfilter/xt_cpu.c +++ b/net/netfilter/xt_cpu.c @@ -22,6 +22,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>"); MODULE_DESCRIPTION("Xtables: CPU match"); +MODULE_ALIAS("ipt_cpu"); +MODULE_ALIAS("ip6t_cpu"); static int cpu_mt_check(const struct xt_mtchk_param *par) { diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c index 9127a3d8aa3..bb10b0717f1 100644 --- a/net/netfilter/xt_ipvs.c +++ b/net/netfilter/xt_ipvs.c @@ -85,7 +85,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) /* * Check if the packet belongs to an existing entry */ - cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */); + cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */); if (unlikely(cp == NULL)) { match = false; goto out; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 91cb1d71f01..c60649ec119 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -164,7 +164,6 @@ struct packet_mreq_max { static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing, int tx_ring); -#define PGV_FROM_VMALLOC 1 struct pgv { char *buffer; }; @@ -523,11 +522,11 @@ static inline unsigned int run_filter(const struct sk_buff *skb, { struct sk_filter *filter; - rcu_read_lock_bh(); - filter = rcu_dereference_bh(sk->sk_filter); + rcu_read_lock(); + filter = rcu_dereference(sk->sk_filter); if (filter != NULL) res = sk_run_filter(skb, filter->insns); - rcu_read_unlock_bh(); + rcu_read_unlock(); return res; } diff --git a/net/rds/rds.h b/net/rds/rds.h index 9542449c072..da8adac2bf0 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -50,7 +50,6 @@ rdsdebug(char *fmt, ...) #define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) #define RDS_CONG_MAP_BYTES (65536 / 8) -#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long)) #define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) #define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) diff --git a/net/sched/Kconfig b/net/sched/Kconfig index f04d4a484d5..e318f458713 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -205,6 +205,18 @@ config NET_SCH_DRR If unsure, say N. +config NET_SCH_MQPRIO + tristate "Multi-queue priority scheduler (MQPRIO)" + help + Say Y here if you want to use the Multi-queue Priority scheduler. + This scheduler allows QOS to be offloaded on NICs that have support + for offloading QOS schedulers. + + To compile this driver as a module, choose M here: the module will + be called sch_mqprio. + + If unsure, say N. + config NET_SCH_INGRESS tristate "Ingress Qdisc" depends on NET_CLS_ACT @@ -243,7 +255,7 @@ config NET_CLS_TCINDEX config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" - select NET_CLS_ROUTE + select IP_ROUTE_CLASSID select NET_CLS ---help--- If you say Y here, you will be able to classify packets @@ -252,9 +264,6 @@ config NET_CLS_ROUTE4 To compile this code as a module, choose M here: the module will be called cls_route. -config NET_CLS_ROUTE - bool - config NET_CLS_FW tristate "Netfilter mark (FW)" select NET_CLS diff --git a/net/sched/Makefile b/net/sched/Makefile index 960f5dba630..26ce681a2c6 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o +obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o obj-$(CONFIG_NET_CLS_FW) += cls_fw.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 23b25f89e7e..15873e14cb5 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -78,7 +78,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb, struct tc_action *a, struct tcf_hashinfo *hinfo) { struct tcf_common *p; - int err = 0, index = -1,i = 0, s_i = 0, n_i = 0; + int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; struct nlattr *nest; read_lock_bh(hinfo->lock); @@ -126,7 +126,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, { struct tcf_common *p, *s_p; struct nlattr *nest; - int i= 0, n_i = 0; + int i = 0, n_i = 0; nest = nla_nest_start(skb, a->order); if (nest == NULL) @@ -138,7 +138,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a, while (p != NULL) { s_p = p->tcfc_next; if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo)) - module_put(a->ops->owner); + module_put(a->ops->owner); n_i++; p = s_p; } @@ -447,7 +447,8 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) nest = nla_nest_start(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; - if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) { + err = tcf_action_dump_old(skb, a, bind, ref); + if (err > 0) { nla_nest_end(skb, nest); return err; } @@ -491,7 +492,7 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, struct tc_action *a; struct tc_action_ops *a_o; char act_name[IFNAMSIZ]; - struct nlattr *tb[TCA_ACT_MAX+1]; + struct nlattr *tb[TCA_ACT_MAX + 1]; struct nlattr *kind; int err; @@ -549,9 +550,9 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est, goto err_free; /* module count goes up only when brand new policy is created - if it exists and is only bound to in a_o->init() then - ACT_P_CREATED is not returned (a zero is). - */ + * if it exists and is only bound to in a_o->init() then + * ACT_P_CREATED is not returned (a zero is). + */ if (err != ACT_P_CREATED) module_put(a_o->owner); a->ops = a_o; @@ -569,7 +570,7 @@ err_out: struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est, char *name, int ovr, int bind) { - struct nlattr *tb[TCA_ACT_MAX_PRIO+1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *head = NULL, *act, *act_prev = NULL; int err; int i; @@ -697,7 +698,7 @@ act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n, static struct tc_action * tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid) { - struct nlattr *tb[TCA_ACT_MAX+1]; + struct nlattr *tb[TCA_ACT_MAX + 1]; struct tc_action *a; int index; int err; @@ -770,7 +771,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, struct tcamsg *t; struct netlink_callback dcb; struct nlattr *nest; - struct nlattr *tb[TCA_ACT_MAX+1]; + struct nlattr *tb[TCA_ACT_MAX + 1]; struct nlattr *kind; struct tc_action *a = create_a(0); int err = -ENOMEM; @@ -821,7 +822,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, nlh->nlmsg_flags |= NLM_F_ROOT; module_put(a->ops->owner); kfree(a); - err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); if (err > 0) return 0; @@ -842,14 +844,14 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, u32 pid, int event) { int i, ret; - struct nlattr *tb[TCA_ACT_MAX_PRIO+1]; + struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct tc_action *head = NULL, *act, *act_prev = NULL; ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); if (ret < 0) return ret; - if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) { + if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) { if (tb[1] != NULL) return tca_action_flush(net, tb[1], n, pid); else @@ -892,7 +894,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, /* now do the delete */ tcf_action_destroy(head, 0); ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC, - n->nlmsg_flags&NLM_F_ECHO); + n->nlmsg_flags & NLM_F_ECHO); if (ret > 0) return 0; return ret; @@ -936,7 +938,7 @@ static int tcf_add_notify(struct net *net, struct tc_action *a, nlh->nlmsg_len = skb_tail_pointer(skb) - b; NETLINK_CB(skb).dst_group = RTNLGRP_TC; - err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags&NLM_F_ECHO); + err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags & NLM_F_ECHO); if (err > 0) err = 0; return err; @@ -967,7 +969,7 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n, /* dump then free all the actions after update; inserted policy * stays intact - * */ + */ ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); for (a = act; a; a = act) { act = a->next; @@ -993,8 +995,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) return -EINVAL; } - /* n->nlmsg_flags&NLM_F_CREATE - * */ + /* n->nlmsg_flags & NLM_F_CREATE */ switch (n->nlmsg_type) { case RTM_NEWACTION: /* we are going to assume all other flags @@ -1003,7 +1004,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) * but since we want avoid ambiguity (eg when flags * is zero) then just set this */ - if (n->nlmsg_flags&NLM_F_REPLACE) + if (n->nlmsg_flags & NLM_F_REPLACE) ovr = 1; replay: ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr); @@ -1028,7 +1029,7 @@ replay: static struct nlattr * find_dump_kind(const struct nlmsghdr *n) { - struct nlattr *tb1, *tb2[TCA_ACT_MAX+1]; + struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; struct nlattr *nla[TCAA_MAX + 1]; struct nlattr *kind; @@ -1071,9 +1072,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) } a_o = tc_lookup_action(kind); - if (a_o == NULL) { + if (a_o == NULL) return 0; - } memset(&a, 0, sizeof(struct tc_action)); a.ops = a_o; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 83ddfc07e45..6cdf9abe475 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -63,7 +63,7 @@ static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy); + err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy); if (err < 0) return err; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index c2ed90a4c0b..2b4ab4b05ce 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -50,7 +50,7 @@ static int gact_determ(struct tcf_gact *gact) } typedef int (*g_rand)(struct tcf_gact *gact); -static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ }; +static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ }; #endif /* CONFIG_GACT_PROB */ static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { @@ -89,7 +89,7 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est, pc = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind, &gact_idx_gen, &gact_hash_info); if (IS_ERR(pc)) - return PTR_ERR(pc); + return PTR_ERR(pc); ret = ACT_P_CREATED; } else { if (!ovr) { @@ -205,9 +205,9 @@ MODULE_LICENSE("GPL"); static int __init gact_init_module(void) { #ifdef CONFIG_GACT_PROB - printk(KERN_INFO "GACT probability on\n"); + pr_info("GACT probability on\n"); #else - printk(KERN_INFO "GACT probability NOT on\n"); + pr_info("GACT probability NOT on\n"); #endif return tcf_register_action(&act_gact_ops); } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index c2a7c20e81c..9fc211a1b20 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -138,7 +138,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind, &ipt_idx_gen, &ipt_hash_info); if (IS_ERR(pc)) - return PTR_ERR(pc); + return PTR_ERR(pc); ret = ACT_P_CREATED; } else { if (!ovr) { @@ -162,7 +162,8 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, if (unlikely(!t)) goto err2; - if ((err = ipt_init_target(t, tname, hook)) < 0) + err = ipt_init_target(t, tname, hook); + if (err < 0) goto err3; spin_lock_bh(&ipt->tcf_lock); @@ -212,8 +213,9 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a, bstats_update(&ipt->tcf_bstats, skb); /* yes, we have to worry about both in and out dev - worry later - danger - this API seems to have changed - from earlier kernels */ + * worry later - danger - this API seems to have changed + * from earlier kernels + */ par.in = skb->dev; par.out = NULL; par.hooknum = ipt->tcfi_hook; @@ -253,9 +255,9 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int struct tc_cnt c; /* for simple targets kernel size == user size - ** user name = target name - ** for foolproof you need to not assume this - */ + * user name = target name + * for foolproof you need to not assume this + */ t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); if (unlikely(!t)) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index d765067e99d..961386e2f2c 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -41,13 +41,13 @@ static struct tcf_hashinfo mirred_hash_info = { .lock = &mirred_lock, }; -static inline int tcf_mirred_release(struct tcf_mirred *m, int bind) +static int tcf_mirred_release(struct tcf_mirred *m, int bind) { if (m) { if (bind) m->tcf_bindcnt--; m->tcf_refcnt--; - if(!m->tcf_bindcnt && m->tcf_refcnt <= 0) { + if (!m->tcf_bindcnt && m->tcf_refcnt <= 0) { list_del(&m->tcfm_list); if (m->tcfm_dev) dev_put(m->tcfm_dev); diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 178a4bd7b7c..762b027650a 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -69,7 +69,7 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est, pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, &nat_idx_gen, &nat_hash_info); if (IS_ERR(pc)) - return PTR_ERR(pc); + return PTR_ERR(pc); p = to_tcf_nat(pc); ret = ACT_P_CREATED; } else { diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 445bef716f7..50c7c06c019 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -70,7 +70,7 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est, pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, &pedit_idx_gen, &pedit_hash_info); if (IS_ERR(pc)) - return PTR_ERR(pc); + return PTR_ERR(pc); p = to_pedit(pc); keys = kmalloc(ksize, GFP_KERNEL); if (keys == NULL) { @@ -127,11 +127,9 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a, int i, munged = 0; unsigned int off; - if (skb_cloned(skb)) { - if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { - return p->tcf_action; - } - } + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return p->tcf_action; off = skb_network_offset(skb); diff --git a/net/sched/act_police.c b/net/sched/act_police.c index e2f08b1e2e5..8a1630774fd 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -22,8 +22,8 @@ #include <net/act_api.h> #include <net/netlink.h> -#define L2T(p,L) qdisc_l2t((p)->tcfp_R_tab, L) -#define L2T_P(p,L) qdisc_l2t((p)->tcfp_P_tab, L) +#define L2T(p, L) qdisc_l2t((p)->tcfp_R_tab, L) +#define L2T_P(p, L) qdisc_l2t((p)->tcfp_P_tab, L) #define POL_TAB_MASK 15 static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1]; @@ -37,8 +37,7 @@ static struct tcf_hashinfo police_hash_info = { }; /* old policer structure from before tc actions */ -struct tc_police_compat -{ +struct tc_police_compat { u32 index; int action; u32 limit; @@ -139,7 +138,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, struct tc_action *a, int ovr, int bind) { - unsigned h; + unsigned int h; int ret = 0, err; struct nlattr *tb[TCA_POLICE_MAX + 1]; struct tc_police *parm; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 7287cff7af3..a34a22de60b 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -47,7 +47,7 @@ static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result /* print policy string followed by _ then packet count * Example if this was the 3rd packet and the string was "hello" * then it would look like "hello_3" (without quotes) - **/ + */ pr_info("simple: %s_%d\n", (char *)d->tcfd_defdata, d->tcf_bstats.packets); spin_unlock(&d->tcf_lock); @@ -125,7 +125,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, &simp_idx_gen, &simp_hash_info); if (IS_ERR(pc)) - return PTR_ERR(pc); + return PTR_ERR(pc); d = to_defact(pc); ret = alloc_defdata(d, defdata); @@ -149,7 +149,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est, return ret; } -static inline int tcf_simp_cleanup(struct tc_action *a, int bind) +static int tcf_simp_cleanup(struct tc_action *a, int bind) { struct tcf_defact *d = a->priv; @@ -158,8 +158,8 @@ static inline int tcf_simp_cleanup(struct tc_action *a, int bind) return 0; } -static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, - int bind, int ref) +static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_defact *d = a->priv; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 836f5fee9e5..5f6f0c7c390 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -113,7 +113,7 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, &skbedit_idx_gen, &skbedit_hash_info); if (IS_ERR(pc)) - return PTR_ERR(pc); + return PTR_ERR(pc); d = to_skbedit(pc); ret = ACT_P_CREATED; @@ -144,7 +144,7 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est, return ret; } -static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind) +static int tcf_skbedit_cleanup(struct tc_action *a, int bind) { struct tcf_skbedit *d = a->priv; @@ -153,8 +153,8 @@ static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind) return 0; } -static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, - int bind, int ref) +static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_skbedit *d = a->priv; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 5fd0c28ef79..bb2c523f815 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -85,7 +85,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) int rc = -ENOENT; write_lock(&cls_mod_lock); - for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) + for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) if (t == ops) break; @@ -111,7 +111,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) u32 first = TC_H_MAKE(0xC0000000U, 0U); if (tp) - first = tp->prio-1; + first = tp->prio - 1; return first; } @@ -149,7 +149,8 @@ replay: if (prio == 0) { /* If no priority is given, user wants we allocated it. */ - if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; prio = TC_H_MAKE(0x80000000U, 0U); } @@ -176,7 +177,8 @@ replay: } /* Is it classful? */ - if ((cops = q->ops->cl_ops) == NULL) + cops = q->ops->cl_ops; + if (!cops) return -EINVAL; if (cops->tcf_chain == NULL) @@ -196,10 +198,11 @@ replay: goto errout; /* Check the chain for existence of proto-tcf with this priority */ - for (back = chain; (tp=*back) != NULL; back = &tp->next) { + for (back = chain; (tp = *back) != NULL; back = &tp->next) { if (tp->prio >= prio) { if (tp->prio == prio) { - if (!nprio || (tp->protocol != protocol && protocol)) + if (!nprio || + (tp->protocol != protocol && protocol)) goto errout; } else tp = NULL; @@ -216,7 +219,8 @@ replay: goto errout; err = -ENOENT; - if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTFILTER || + !(n->nlmsg_flags & NLM_F_CREATE)) goto errout; @@ -420,7 +424,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) return skb->len; - if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return skb->len; if (!tcm->tcm_parent) @@ -429,7 +434,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); if (!q) goto out; - if ((cops = q->ops->cl_ops) == NULL) + cops = q->ops->cl_ops; + if (!cops) goto errout; if (cops->tcf_chain == NULL) goto errout; @@ -444,8 +450,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; - for (tp=*chain, t=0; tp; tp = tp->next, t++) { - if (t < s_t) continue; + for (tp = *chain, t = 0; tp; tp = tp->next, t++) { + if (t < s_t) + continue; if (TC_H_MAJ(tcm->tcm_info) && TC_H_MAJ(tcm->tcm_info) != tp->prio) continue; @@ -468,10 +475,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) arg.skb = skb; arg.cb = cb; arg.w.stop = 0; - arg.w.skip = cb->args[1]-1; + arg.w.skip = cb->args[1] - 1; arg.w.count = 0; tp->ops->walk(tp, &arg.w); - cb->args[1] = arg.w.count+1; + cb->args[1] = arg.w.count + 1; if (arg.w.stop) break; } diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index f23d9155b1e..8be8872dd57 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -21,14 +21,12 @@ #include <net/act_api.h> #include <net/pkt_cls.h> -struct basic_head -{ +struct basic_head { u32 hgenerator; struct list_head flist; }; -struct basic_filter -{ +struct basic_filter { u32 handle; struct tcf_exts exts; struct tcf_ematch_tree ematches; @@ -92,8 +90,7 @@ static int basic_init(struct tcf_proto *tp) return 0; } -static inline void basic_delete_filter(struct tcf_proto *tp, - struct basic_filter *f) +static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f) { tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(tp, &f->exts); @@ -135,9 +132,9 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = { [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED }, }; -static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, - unsigned long base, struct nlattr **tb, - struct nlattr *est) +static int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, + unsigned long base, struct nlattr **tb, + struct nlattr *est) { int err = -EINVAL; struct tcf_exts e; @@ -203,7 +200,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, } while (--i > 0 && basic_get(tp, head->hgenerator)); if (i <= 0) { - printk(KERN_ERR "Insufficient number of handles\n"); + pr_err("Insufficient number of handles\n"); goto errout; } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index d49c40fb7e0..32a335194ca 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -56,7 +56,8 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, { struct cgroup_cls_state *cs; - if (!(cs = kzalloc(sizeof(*cs), GFP_KERNEL))) + cs = kzalloc(sizeof(*cs), GFP_KERNEL); + if (!cs) return ERR_PTR(-ENOMEM); if (cgrp->parent) @@ -94,8 +95,7 @@ static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); } -struct cls_cgroup_head -{ +struct cls_cgroup_head { u32 handle; struct tcf_exts exts; struct tcf_ematch_tree ematches; @@ -166,7 +166,7 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, unsigned long *arg) { - struct nlattr *tb[TCA_CGROUP_MAX+1]; + struct nlattr *tb[TCA_CGROUP_MAX + 1]; struct cls_cgroup_head *head = tp->root; struct tcf_ematch_tree t; struct tcf_exts e; diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5b271a18bc3..8ec01391d98 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -121,7 +121,7 @@ static u32 flow_get_proto_src(struct sk_buff *skb) if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ip_hdr(skb); - if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + if (iph->frag_off & htons(IP_MF | IP_OFFSET)) break; poff = proto_ports_offset(iph->protocol); if (poff >= 0 && @@ -163,7 +163,7 @@ static u32 flow_get_proto_dst(struct sk_buff *skb) if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ip_hdr(skb); - if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + if (iph->frag_off & htons(IP_MF | IP_OFFSET)) break; poff = proto_ports_offset(iph->protocol); if (poff >= 0 && @@ -276,7 +276,7 @@ fallback: static u32 flow_get_rtclassid(const struct sk_buff *skb) { -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID if (skb_dst(skb)) return skb_dst(skb)->tclassid; #endif diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 93b0a7b6f9b..26e7bc4ffb7 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -31,14 +31,12 @@ #define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *)) -struct fw_head -{ +struct fw_head { struct fw_filter *ht[HTSIZE]; u32 mask; }; -struct fw_filter -{ +struct fw_filter { struct fw_filter *next; u32 id; struct tcf_result res; @@ -53,7 +51,7 @@ static const struct tcf_ext_map fw_ext_map = { .police = TCA_FW_POLICE }; -static __inline__ int fw_hash(u32 handle) +static inline int fw_hash(u32 handle) { if (HTSIZE == 4096) return ((handle >> 24) & 0xFFF) ^ @@ -82,14 +80,14 @@ static __inline__ int fw_hash(u32 handle) static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = (struct fw_head *)tp->root; struct fw_filter *f; int r; u32 id = skb->mark; if (head != NULL) { id &= head->mask; - for (f=head->ht[fw_hash(id)]; f; f=f->next) { + for (f = head->ht[fw_hash(id)]; f; f = f->next) { if (f->id == id) { *res = f->res; #ifdef CONFIG_NET_CLS_IND @@ -105,7 +103,8 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, } } else { /* old method */ - if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) { + if (id && (TC_H_MAJ(id) == 0 || + !(TC_H_MAJ(id ^ tp->q->handle)))) { res->classid = id; res->class = 0; return 0; @@ -117,13 +116,13 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, static unsigned long fw_get(struct tcf_proto *tp, u32 handle) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = (struct fw_head *)tp->root; struct fw_filter *f; if (head == NULL) return 0; - for (f=head->ht[fw_hash(handle)]; f; f=f->next) { + for (f = head->ht[fw_hash(handle)]; f; f = f->next) { if (f->id == handle) return (unsigned long)f; } @@ -139,8 +138,7 @@ static int fw_init(struct tcf_proto *tp) return 0; } -static inline void -fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) +static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) { tcf_unbind_filter(tp, &f->res); tcf_exts_destroy(tp, &f->exts); @@ -156,8 +154,8 @@ static void fw_destroy(struct tcf_proto *tp) if (head == NULL) return; - for (h=0; h<HTSIZE; h++) { - while ((f=head->ht[h]) != NULL) { + for (h = 0; h < HTSIZE; h++) { + while ((f = head->ht[h]) != NULL) { head->ht[h] = f->next; fw_delete_filter(tp, f); } @@ -167,14 +165,14 @@ static void fw_destroy(struct tcf_proto *tp) static int fw_delete(struct tcf_proto *tp, unsigned long arg) { - struct fw_head *head = (struct fw_head*)tp->root; - struct fw_filter *f = (struct fw_filter*)arg; + struct fw_head *head = (struct fw_head *)tp->root; + struct fw_filter *f = (struct fw_filter *)arg; struct fw_filter **fp; if (head == NULL || f == NULL) goto out; - for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { + for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { if (*fp == f) { tcf_tree_lock(tp); *fp = f->next; @@ -240,7 +238,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base, struct nlattr **tca, unsigned long *arg) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = (struct fw_head *)tp->root; struct fw_filter *f = (struct fw_filter *) *arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_FW_MAX + 1]; @@ -302,7 +300,7 @@ errout: static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) { - struct fw_head *head = (struct fw_head*)tp->root; + struct fw_head *head = (struct fw_head *)tp->root; int h; if (head == NULL) @@ -332,7 +330,7 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { struct fw_head *head = (struct fw_head *)tp->root; - struct fw_filter *f = (struct fw_filter*)fh; + struct fw_filter *f = (struct fw_filter *)fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 694dcd85dec..d580cdfca09 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -23,34 +23,30 @@ #include <net/pkt_cls.h> /* - 1. For now we assume that route tags < 256. - It allows to use direct table lookups, instead of hash tables. - 2. For now we assume that "from TAG" and "fromdev DEV" statements - are mutually exclusive. - 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" + * 1. For now we assume that route tags < 256. + * It allows to use direct table lookups, instead of hash tables. + * 2. For now we assume that "from TAG" and "fromdev DEV" statements + * are mutually exclusive. + * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" */ -struct route4_fastmap -{ +struct route4_fastmap { struct route4_filter *filter; u32 id; int iif; }; -struct route4_head -{ +struct route4_head { struct route4_fastmap fastmap[16]; - struct route4_bucket *table[256+1]; + struct route4_bucket *table[256 + 1]; }; -struct route4_bucket -{ +struct route4_bucket { /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ - struct route4_filter *ht[16+16+1]; + struct route4_filter *ht[16 + 16 + 1]; }; -struct route4_filter -{ +struct route4_filter { struct route4_filter *next; u32 id; int iif; @@ -61,20 +57,20 @@ struct route4_filter struct route4_bucket *bkt; }; -#define ROUTE4_FAILURE ((struct route4_filter*)(-1L)) +#define ROUTE4_FAILURE ((struct route4_filter *)(-1L)) static const struct tcf_ext_map route_ext_map = { .police = TCA_ROUTE4_POLICE, .action = TCA_ROUTE4_ACT }; -static __inline__ int route4_fastmap_hash(u32 id, int iif) +static inline int route4_fastmap_hash(u32 id, int iif) { - return id&0xF; + return id & 0xF; } -static inline -void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id) +static void +route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id) { spinlock_t *root_lock = qdisc_root_sleeping_lock(q); @@ -83,32 +79,33 @@ void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id) spin_unlock_bh(root_lock); } -static inline void +static void route4_set_fastmap(struct route4_head *head, u32 id, int iif, struct route4_filter *f) { int h = route4_fastmap_hash(id, iif); + head->fastmap[h].id = id; head->fastmap[h].iif = iif; head->fastmap[h].filter = f; } -static __inline__ int route4_hash_to(u32 id) +static inline int route4_hash_to(u32 id) { - return id&0xFF; + return id & 0xFF; } -static __inline__ int route4_hash_from(u32 id) +static inline int route4_hash_from(u32 id) { - return (id>>16)&0xF; + return (id >> 16) & 0xF; } -static __inline__ int route4_hash_iif(int iif) +static inline int route4_hash_iif(int iif) { - return 16 + ((iif>>16)&0xF); + return 16 + ((iif >> 16) & 0xF); } -static __inline__ int route4_hash_wild(void) +static inline int route4_hash_wild(void) { return 32; } @@ -131,21 +128,22 @@ static __inline__ int route4_hash_wild(void) static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) { - struct route4_head *head = (struct route4_head*)tp->root; + struct route4_head *head = (struct route4_head *)tp->root; struct dst_entry *dst; struct route4_bucket *b; struct route4_filter *f; u32 id, h; int iif, dont_cache = 0; - if ((dst = skb_dst(skb)) == NULL) + dst = skb_dst(skb); + if (!dst) goto failure; id = dst->tclassid; if (head == NULL) goto old_method; - iif = ((struct rtable*)dst)->fl.iif; + iif = ((struct rtable *)dst)->fl.iif; h = route4_fastmap_hash(id, iif); if (id == head->fastmap[h].id && @@ -161,7 +159,8 @@ static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, h = route4_hash_to(id); restart: - if ((b = head->table[h]) != NULL) { + b = head->table[h]; + if (b) { for (f = b->ht[route4_hash_from(id)]; f; f = f->next) if (f->id == id) ROUTE4_APPLY_RESULT(); @@ -197,8 +196,9 @@ old_method: static inline u32 to_hash(u32 id) { - u32 h = id&0xFF; - if (id&0x8000) + u32 h = id & 0xFF; + + if (id & 0x8000) h += 256; return h; } @@ -211,17 +211,17 @@ static inline u32 from_hash(u32 id) if (!(id & 0x8000)) { if (id > 255) return 256; - return id&0xF; + return id & 0xF; } - return 16 + (id&0xF); + return 16 + (id & 0xF); } static unsigned long route4_get(struct tcf_proto *tp, u32 handle) { - struct route4_head *head = (struct route4_head*)tp->root; + struct route4_head *head = (struct route4_head *)tp->root; struct route4_bucket *b; struct route4_filter *f; - unsigned h1, h2; + unsigned int h1, h2; if (!head) return 0; @@ -230,11 +230,12 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle) if (h1 > 256) return 0; - h2 = from_hash(handle>>16); + h2 = from_hash(handle >> 16); if (h2 > 32) return 0; - if ((b = head->table[h1]) != NULL) { + b = head->table[h1]; + if (b) { for (f = b->ht[h2]; f; f = f->next) if (f->handle == handle) return (unsigned long)f; @@ -251,7 +252,7 @@ static int route4_init(struct tcf_proto *tp) return 0; } -static inline void +static void route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) { tcf_unbind_filter(tp, &f->res); @@ -267,11 +268,12 @@ static void route4_destroy(struct tcf_proto *tp) if (head == NULL) return; - for (h1=0; h1<=256; h1++) { + for (h1 = 0; h1 <= 256; h1++) { struct route4_bucket *b; - if ((b = head->table[h1]) != NULL) { - for (h2=0; h2<=32; h2++) { + b = head->table[h1]; + if (b) { + for (h2 = 0; h2 <= 32; h2++) { struct route4_filter *f; while ((f = b->ht[h2]) != NULL) { @@ -287,9 +289,9 @@ static void route4_destroy(struct tcf_proto *tp) static int route4_delete(struct tcf_proto *tp, unsigned long arg) { - struct route4_head *head = (struct route4_head*)tp->root; - struct route4_filter **fp, *f = (struct route4_filter*)arg; - unsigned h = 0; + struct route4_head *head = (struct route4_head *)tp->root; + struct route4_filter **fp, *f = (struct route4_filter *)arg; + unsigned int h = 0; struct route4_bucket *b; int i; @@ -299,7 +301,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) h = f->handle; b = f->bkt; - for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) { + for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) { if (*fp == f) { tcf_tree_lock(tp); *fp = f->next; @@ -310,7 +312,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) /* Strip tree */ - for (i=0; i<=32; i++) + for (i = 0; i <= 32; i++) if (b->ht[i]) return 0; @@ -380,7 +382,8 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, } h1 = to_hash(nhandle); - if ((b = head->table[h1]) == NULL) { + b = head->table[h1]; + if (!b) { err = -ENOBUFS; b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL); if (b == NULL) @@ -391,6 +394,7 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base, tcf_tree_unlock(tp); } else { unsigned int h2 = from_hash(nhandle >> 16); + err = -EEXIST; for (fp = b->ht[h2]; fp; fp = fp->next) if (fp->handle == f->handle) @@ -444,7 +448,8 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, if (err < 0) return err; - if ((f = (struct route4_filter*)*arg) != NULL) { + f = (struct route4_filter *)*arg; + if (f) { if (f->handle != handle && handle) return -EINVAL; @@ -481,7 +486,7 @@ static int route4_change(struct tcf_proto *tp, unsigned long base, reinsert: h = from_hash(f->handle >> 16); - for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next) + for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next) if (f->handle < f1->handle) break; @@ -492,7 +497,8 @@ reinsert: if (old_handle && f->handle != old_handle) { th = to_hash(old_handle); h = from_hash(old_handle >> 16); - if ((b = head->table[th]) != NULL) { + b = head->table[th]; + if (b) { for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) { if (*fp == f) { *fp = f->next; @@ -515,7 +521,7 @@ errout: static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct route4_head *head = tp->root; - unsigned h, h1; + unsigned int h, h1; if (head == NULL) arg->stop = 1; @@ -549,7 +555,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) static int route4_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct route4_filter *f = (struct route4_filter*)fh; + struct route4_filter *f = (struct route4_filter *)fh; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; u32 id; @@ -563,15 +569,15 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh, if (nest == NULL) goto nla_put_failure; - if (!(f->handle&0x8000)) { - id = f->id&0xFF; + if (!(f->handle & 0x8000)) { + id = f->id & 0xFF; NLA_PUT_U32(skb, TCA_ROUTE4_TO, id); } - if (f->handle&0x80000000) { - if ((f->handle>>16) != 0xFFFF) + if (f->handle & 0x80000000) { + if ((f->handle >> 16) != 0xFFFF) NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif); } else { - id = f->id>>16; + id = f->id >> 16; NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id); } if (f->res.classid) diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 425a1790b04..402c44b241a 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -66,28 +66,25 @@ powerful classification engine. */ -struct rsvp_head -{ +struct rsvp_head { u32 tmap[256/32]; u32 hgenerator; u8 tgenerator; struct rsvp_session *ht[256]; }; -struct rsvp_session -{ +struct rsvp_session { struct rsvp_session *next; __be32 dst[RSVP_DST_LEN]; struct tc_rsvp_gpi dpi; u8 protocol; u8 tunnelid; /* 16 (src,sport) hash slots, and one wildcard source slot */ - struct rsvp_filter *ht[16+1]; + struct rsvp_filter *ht[16 + 1]; }; -struct rsvp_filter -{ +struct rsvp_filter { struct rsvp_filter *next; __be32 src[RSVP_DST_LEN]; struct tc_rsvp_gpi spi; @@ -100,17 +97,19 @@ struct rsvp_filter struct rsvp_session *sess; }; -static __inline__ unsigned hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) +static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) { - unsigned h = (__force __u32)dst[RSVP_DST_LEN-1]; + unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1]; + h ^= h>>16; h ^= h>>8; return (h ^ protocol ^ tunnelid) & 0xFF; } -static __inline__ unsigned hash_src(__be32 *src) +static inline unsigned int hash_src(__be32 *src) { - unsigned h = (__force __u32)src[RSVP_DST_LEN-1]; + unsigned int h = (__force __u32)src[RSVP_DST_LEN-1]; + h ^= h>>16; h ^= h>>8; h ^= h>>4; @@ -134,10 +133,10 @@ static struct tcf_ext_map rsvp_ext_map = { static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) { - struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; struct rsvp_session *s; struct rsvp_filter *f; - unsigned h1, h2; + unsigned int h1, h2; __be32 *dst, *src; u8 protocol; u8 tunnelid = 0; @@ -162,13 +161,13 @@ restart: src = &nhptr->saddr.s6_addr32[0]; dst = &nhptr->daddr.s6_addr32[0]; protocol = nhptr->nexthdr; - xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); + xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr); #else src = &nhptr->saddr; dst = &nhptr->daddr; protocol = nhptr->protocol; - xprt = ((u8*)nhptr) + (nhptr->ihl<<2); - if (nhptr->frag_off & htons(IP_MF|IP_OFFSET)) + xprt = ((u8 *)nhptr) + (nhptr->ihl<<2); + if (nhptr->frag_off & htons(IP_MF | IP_OFFSET)) return -1; #endif @@ -176,10 +175,10 @@ restart: h2 = hash_src(src); for (s = sht[h1]; s; s = s->next) { - if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] && protocol == s->protocol && !(s->dpi.mask & - (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) && + (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) && #if RSVP_DST_LEN == 4 dst[0] == s->dst[0] && dst[1] == s->dst[1] && @@ -188,8 +187,8 @@ restart: tunnelid == s->tunnelid) { for (f = s->ht[h2]; f; f = f->next) { - if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && - !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key)) + if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] && + !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key)) #if RSVP_DST_LEN == 4 && src[0] == f->src[0] && @@ -205,7 +204,7 @@ matched: return 0; tunnelid = f->res.classid; - nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); + nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr)); goto restart; } } @@ -224,11 +223,11 @@ matched: static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) { - struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht; struct rsvp_session *s; struct rsvp_filter *f; - unsigned h1 = handle&0xFF; - unsigned h2 = (handle>>8)&0xFF; + unsigned int h1 = handle & 0xFF; + unsigned int h2 = (handle >> 8) & 0xFF; if (h2 > 16) return 0; @@ -258,7 +257,7 @@ static int rsvp_init(struct tcf_proto *tp) return -ENOBUFS; } -static inline void +static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) { tcf_unbind_filter(tp, &f->res); @@ -277,13 +276,13 @@ static void rsvp_destroy(struct tcf_proto *tp) sht = data->ht; - for (h1=0; h1<256; h1++) { + for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; while ((s = sht[h1]) != NULL) { sht[h1] = s->next; - for (h2=0; h2<=16; h2++) { + for (h2 = 0; h2 <= 16; h2++) { struct rsvp_filter *f; while ((f = s->ht[h2]) != NULL) { @@ -299,13 +298,13 @@ static void rsvp_destroy(struct tcf_proto *tp) static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) { - struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; - unsigned h = f->handle; + struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg; + unsigned int h = f->handle; struct rsvp_session **sp; struct rsvp_session *s = f->sess; int i; - for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { + for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) { if (*fp == f) { tcf_tree_lock(tp); *fp = f->next; @@ -314,12 +313,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) /* Strip tree */ - for (i=0; i<=16; i++) + for (i = 0; i <= 16; i++) if (s->ht[i]) return 0; /* OK, session has no flows */ - for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; + for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF]; *sp; sp = &(*sp)->next) { if (*sp == s) { tcf_tree_lock(tp); @@ -337,13 +336,14 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) return 0; } -static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) +static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) { struct rsvp_head *data = tp->root; int i = 0xFFFF; while (i-- > 0) { u32 h; + if ((data->hgenerator += 0x10000) == 0) data->hgenerator = 0x10000; h = data->hgenerator|salt; @@ -355,10 +355,10 @@ static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) static int tunnel_bts(struct rsvp_head *data) { - int n = data->tgenerator>>5; - u32 b = 1<<(data->tgenerator&0x1F); + int n = data->tgenerator >> 5; + u32 b = 1 << (data->tgenerator & 0x1F); - if (data->tmap[n]&b) + if (data->tmap[n] & b) return 0; data->tmap[n] |= b; return 1; @@ -372,10 +372,10 @@ static void tunnel_recycle(struct rsvp_head *data) memset(tmap, 0, sizeof(tmap)); - for (h1=0; h1<256; h1++) { + for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; for (s = sht[h1]; s; s = s->next) { - for (h2=0; h2<=16; h2++) { + for (h2 = 0; h2 <= 16; h2++) { struct rsvp_filter *f; for (f = s->ht[h2]; f; f = f->next) { @@ -395,8 +395,8 @@ static u32 gen_tunnel(struct rsvp_head *data) { int i, k; - for (k=0; k<2; k++) { - for (i=255; i>0; i--) { + for (k = 0; k < 2; k++) { + for (i = 255; i > 0; i--) { if (++data->tgenerator == 0) data->tgenerator = 1; if (tunnel_bts(data)) @@ -428,7 +428,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, struct nlattr *opt = tca[TCA_OPTIONS-1]; struct nlattr *tb[TCA_RSVP_MAX + 1]; struct tcf_exts e; - unsigned h1, h2; + unsigned int h1, h2; __be32 *dst; int err; @@ -443,7 +443,8 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, if (err < 0) return err; - if ((f = (struct rsvp_filter*)*arg) != NULL) { + f = (struct rsvp_filter *)*arg; + if (f) { /* Node exists: adjust only classid */ if (f->handle != handle && handle) @@ -500,7 +501,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base, goto errout; } - for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { + for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) { if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && pinfo && pinfo->protocol == s->protocol && memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && @@ -523,7 +524,7 @@ insert: tcf_exts_change(tp, &f->exts, &e); for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) - if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) + if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask) break; f->next = *fp; wmb(); @@ -567,7 +568,7 @@ errout2: static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct rsvp_head *head = tp->root; - unsigned h, h1; + unsigned int h, h1; if (arg->stop) return; @@ -598,7 +599,7 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct rsvp_filter *f = (struct rsvp_filter*)fh; + struct rsvp_filter *f = (struct rsvp_filter *)fh; struct rsvp_session *s; unsigned char *b = skb_tail_pointer(skb); struct nlattr *nest; @@ -624,7 +625,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); if (f->res.classid) NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid); - if (((f->handle>>8)&0xFF) != 16) + if (((f->handle >> 8) & 0xFF) != 16) NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 20ef330bb91..36667fa6423 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -249,7 +249,7 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, * of the hashing index is below the threshold. */ if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) - cp.hash = (cp.mask >> cp.shift)+1; + cp.hash = (cp.mask >> cp.shift) + 1; else cp.hash = DEFAULT_HASH_SIZE; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index b0c2a82178a..966920c14e7 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -42,8 +42,7 @@ #include <net/act_api.h> #include <net/pkt_cls.h> -struct tc_u_knode -{ +struct tc_u_knode { struct tc_u_knode *next; u32 handle; struct tc_u_hnode *ht_up; @@ -63,19 +62,17 @@ struct tc_u_knode struct tc_u32_sel sel; }; -struct tc_u_hnode -{ +struct tc_u_hnode { struct tc_u_hnode *next; u32 handle; u32 prio; struct tc_u_common *tp_c; int refcnt; - unsigned divisor; + unsigned int divisor; struct tc_u_knode *ht[1]; }; -struct tc_u_common -{ +struct tc_u_common { struct tc_u_hnode *hlist; struct Qdisc *q; int refcnt; @@ -87,9 +84,11 @@ static const struct tcf_ext_map u32_ext_map = { .police = TCA_U32_POLICE }; -static __inline__ unsigned u32_hash_fold(__be32 key, struct tc_u32_sel *sel, u8 fshift) +static inline unsigned int u32_hash_fold(__be32 key, + const struct tc_u32_sel *sel, + u8 fshift) { - unsigned h = ntohl(key & sel->hmask)>>fshift; + unsigned int h = ntohl(key & sel->hmask) >> fshift; return h; } @@ -101,7 +100,7 @@ static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_re unsigned int off; } stack[TC_U32_MAXDEPTH]; - struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; + struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root; unsigned int off = skb_network_offset(skb); struct tc_u_knode *n; int sdepth = 0; @@ -120,7 +119,7 @@ next_knode: struct tc_u32_key *key = n->sel.keys; #ifdef CONFIG_CLS_U32_PERF - n->pf->rcnt +=1; + n->pf->rcnt += 1; j = 0; #endif @@ -133,7 +132,7 @@ next_knode: } #endif - for (i = n->sel.nkeys; i>0; i--, key++) { + for (i = n->sel.nkeys; i > 0; i--, key++) { int toff = off + key->off + (off2 & key->offmask); __be32 *data, _data; @@ -148,13 +147,13 @@ next_knode: goto next_knode; } #ifdef CONFIG_CLS_U32_PERF - n->pf->kcnts[j] +=1; + n->pf->kcnts[j] += 1; j++; #endif } if (n->ht_down == NULL) { check_terminal: - if (n->sel.flags&TC_U32_TERMINAL) { + if (n->sel.flags & TC_U32_TERMINAL) { *res = n->res; #ifdef CONFIG_NET_CLS_IND @@ -164,7 +163,7 @@ check_terminal: } #endif #ifdef CONFIG_CLS_U32_PERF - n->pf->rhit +=1; + n->pf->rhit += 1; #endif r = tcf_exts_exec(skb, &n->exts, res); if (r < 0) { @@ -197,10 +196,10 @@ check_terminal: sel = ht->divisor & u32_hash_fold(*data, &n->sel, n->fshift); } - if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) + if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT))) goto next_ht; - if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { + if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) { off2 = n->sel.off + 3; if (n->sel.flags & TC_U32_VAROFFSET) { __be16 *data, _data; @@ -215,7 +214,7 @@ check_terminal: } off2 &= ~3; } - if (n->sel.flags&TC_U32_EAT) { + if (n->sel.flags & TC_U32_EAT) { off += off2; off2 = 0; } @@ -236,11 +235,11 @@ out: deadloop: if (net_ratelimit()) - printk(KERN_WARNING "cls_u32: dead loop\n"); + pr_warning("cls_u32: dead loop\n"); return -1; } -static __inline__ struct tc_u_hnode * +static struct tc_u_hnode * u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) { struct tc_u_hnode *ht; @@ -252,10 +251,10 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) return ht; } -static __inline__ struct tc_u_knode * +static struct tc_u_knode * u32_lookup_key(struct tc_u_hnode *ht, u32 handle) { - unsigned sel; + unsigned int sel; struct tc_u_knode *n = NULL; sel = TC_U32_HASH(handle); @@ -300,7 +299,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c) do { if (++tp_c->hgenerator == 0x7FF) tp_c->hgenerator = 1; - } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); + } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; } @@ -378,9 +377,9 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) { struct tc_u_knode *n; - unsigned h; + unsigned int h; - for (h=0; h<=ht->divisor; h++) { + for (h = 0; h <= ht->divisor; h++) { while ((n = ht->ht[h]) != NULL) { ht->ht[h] = n->next; @@ -446,13 +445,13 @@ static void u32_destroy(struct tcf_proto *tp) static int u32_delete(struct tcf_proto *tp, unsigned long arg) { - struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; + struct tc_u_hnode *ht = (struct tc_u_hnode *)arg; if (ht == NULL) return 0; if (TC_U32_KEY(ht->handle)) - return u32_delete_key(tp, (struct tc_u_knode*)ht); + return u32_delete_key(tp, (struct tc_u_knode *)ht); if (tp->root == ht) return -EINVAL; @@ -470,14 +469,14 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) { struct tc_u_knode *n; - unsigned i = 0x7FF; + unsigned int i = 0x7FF; - for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next) if (i < TC_U32_NODE(n->handle)) i = TC_U32_NODE(n->handle); i++; - return handle|(i>0xFFF ? 0xFFF : i); + return handle | (i > 0xFFF ? 0xFFF : i); } static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { @@ -566,7 +565,8 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, if (err < 0) return err; - if ((n = (struct tc_u_knode*)*arg) != NULL) { + n = (struct tc_u_knode *)*arg; + if (n) { if (TC_U32_KEY(n->handle) == 0) return -EINVAL; @@ -574,7 +574,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, } if (tb[TCA_U32_DIVISOR]) { - unsigned divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); + unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); if (--divisor > 0x100) return -EINVAL; @@ -585,7 +585,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, if (handle == 0) return -ENOMEM; } - ht = kzalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); + ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL); if (ht == NULL) return -ENOBUFS; ht->tp_c = tp_c; @@ -683,7 +683,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *ht; struct tc_u_knode *n; - unsigned h; + unsigned int h; if (arg->stop) return; @@ -717,7 +717,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) static int u32_dump(struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *t) { - struct tc_u_knode *n = (struct tc_u_knode*)fh; + struct tc_u_knode *n = (struct tc_u_knode *)fh; struct nlattr *nest; if (n == NULL) @@ -730,8 +730,9 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; if (TC_U32_KEY(n->handle) == 0) { - struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; - u32 divisor = ht->divisor+1; + struct tc_u_hnode *ht = (struct tc_u_hnode *)fh; + u32 divisor = ht->divisor + 1; + NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor); } else { NLA_PUT(skb, TCA_U32_SEL, @@ -755,7 +756,7 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh, goto nla_put_failure; #ifdef CONFIG_NET_CLS_IND - if(strlen(n->indev)) + if (strlen(n->indev)) NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev); #endif #ifdef CONFIG_CLS_U32_PERF diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index bc450397487..1c8360a2752 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -33,40 +33,41 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, return 0; switch (cmp->align) { - case TCF_EM_ALIGN_U8: - val = *ptr; - break; + case TCF_EM_ALIGN_U8: + val = *ptr; + break; - case TCF_EM_ALIGN_U16: - val = get_unaligned_be16(ptr); + case TCF_EM_ALIGN_U16: + val = get_unaligned_be16(ptr); - if (cmp_needs_transformation(cmp)) - val = be16_to_cpu(val); - break; + if (cmp_needs_transformation(cmp)) + val = be16_to_cpu(val); + break; - case TCF_EM_ALIGN_U32: - /* Worth checking boundries? The branching seems - * to get worse. Visit again. */ - val = get_unaligned_be32(ptr); + case TCF_EM_ALIGN_U32: + /* Worth checking boundries? The branching seems + * to get worse. Visit again. + */ + val = get_unaligned_be32(ptr); - if (cmp_needs_transformation(cmp)) - val = be32_to_cpu(val); - break; + if (cmp_needs_transformation(cmp)) + val = be32_to_cpu(val); + break; - default: - return 0; + default: + return 0; } if (cmp->mask) val &= cmp->mask; switch (cmp->opnd) { - case TCF_EM_OPND_EQ: - return val == cmp->val; - case TCF_EM_OPND_LT: - return val < cmp->val; - case TCF_EM_OPND_GT: - return val > cmp->val; + case TCF_EM_OPND_EQ: + return val == cmp->val; + case TCF_EM_OPND_LT: + return val < cmp->val; + case TCF_EM_OPND_GT: + return val > cmp->val; } return 0; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 34da5e29ea1..a889d099320 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -73,21 +73,18 @@ #include <net/pkt_cls.h> #include <net/sock.h> -struct meta_obj -{ +struct meta_obj { unsigned long value; unsigned int len; }; -struct meta_value -{ +struct meta_value { struct tcf_meta_val hdr; unsigned long val; unsigned int len; }; -struct meta_match -{ +struct meta_match { struct meta_value lvalue; struct meta_value rvalue; }; @@ -255,7 +252,7 @@ META_COLLECTOR(int_rtclassid) if (unlikely(skb_dst(skb) == NULL)) *err = -1; else -#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_ROUTE_CLASSID dst->value = skb_dst(skb)->tclassid; #else dst->value = 0; @@ -483,8 +480,7 @@ META_COLLECTOR(int_sk_write_pend) * Meta value collectors assignment table **************************************************************************/ -struct meta_ops -{ +struct meta_ops { void (*get)(struct sk_buff *, struct tcf_pkt_info *, struct meta_value *, struct meta_obj *, int *); }; @@ -494,7 +490,7 @@ struct meta_ops /* Meta value operations table listing all meta value collectors and * assigns them to a type and meta id. */ -static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { +static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = { [TCF_META_TYPE_VAR] = { [META_ID(DEV)] = META_FUNC(var_dev), [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), @@ -550,7 +546,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { } }; -static inline struct meta_ops * meta_ops(struct meta_value *val) +static inline struct meta_ops *meta_ops(struct meta_value *val) { return &__meta_ops[meta_type(val)][meta_id(val)]; } @@ -649,9 +645,8 @@ static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { if (v->len == sizeof(unsigned long)) NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val); - else if (v->len == sizeof(u32)) { + else if (v->len == sizeof(u32)) NLA_PUT_U32(skb, tlv, v->val); - } return 0; @@ -663,8 +658,7 @@ nla_put_failure: * Type specific operations table **************************************************************************/ -struct meta_type_ops -{ +struct meta_type_ops { void (*destroy)(struct meta_value *); int (*compare)(struct meta_obj *, struct meta_obj *); int (*change)(struct meta_value *, struct nlattr *); @@ -672,7 +666,7 @@ struct meta_type_ops int (*dump)(struct sk_buff *, struct meta_value *, int); }; -static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { +static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = { [TCF_META_TYPE_VAR] = { .destroy = meta_var_destroy, .compare = meta_var_compare, @@ -688,7 +682,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { } }; -static inline struct meta_type_ops * meta_type_ops(struct meta_value *v) +static inline struct meta_type_ops *meta_type_ops(struct meta_value *v) { return &__meta_type_ops[meta_type(v)]; } @@ -713,7 +707,7 @@ static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, return err; if (meta_type_ops(v)->apply_extras) - meta_type_ops(v)->apply_extras(v, dst); + meta_type_ops(v)->apply_extras(v, dst); return 0; } @@ -732,12 +726,12 @@ static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m, r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value); switch (meta->lvalue.hdr.op) { - case TCF_EM_OPND_EQ: - return !r; - case TCF_EM_OPND_LT: - return r < 0; - case TCF_EM_OPND_GT: - return r > 0; + case TCF_EM_OPND_EQ: + return !r; + case TCF_EM_OPND_LT: + return r < 0; + case TCF_EM_OPND_GT: + return r > 0; } return 0; @@ -771,7 +765,7 @@ static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla) static inline int meta_is_supported(struct meta_value *val) { - return (!meta_id(val) || meta_ops(val)->get); + return !meta_id(val) || meta_ops(val)->get; } static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 1a4176aee6e..a3bed07a008 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -18,8 +18,7 @@ #include <linux/tc_ematch/tc_em_nbyte.h> #include <net/pkt_cls.h> -struct nbyte_data -{ +struct nbyte_data { struct tcf_em_nbyte hdr; char pattern[0]; }; diff --git a/net/sched/em_text.c b/net/sched/em_text.c index ea8f566e720..15d353d2e4b 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -19,8 +19,7 @@ #include <linux/tc_ematch/tc_em_text.h> #include <net/pkt_cls.h> -struct text_match -{ +struct text_match { u16 from_offset; u16 to_offset; u8 from_layer; diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c index 953f1479f7d..797bdb88c01 100644 --- a/net/sched/em_u32.c +++ b/net/sched/em_u32.c @@ -35,7 +35,7 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, if (!tcf_valid_offset(skb, ptr, sizeof(u32))) return 0; - return !(((*(__be32*) ptr) ^ key->val) & key->mask); + return !(((*(__be32 *) ptr) ^ key->val) & key->mask); } static struct tcf_ematch_ops em_u32_ops = { diff --git a/net/sched/ematch.c b/net/sched/ematch.c index 5e37da961f8..88d93eb9250 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -93,7 +93,7 @@ static LIST_HEAD(ematch_ops); static DEFINE_RWLOCK(ematch_mod_lock); -static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind) +static struct tcf_ematch_ops *tcf_em_lookup(u16 kind) { struct tcf_ematch_ops *e = NULL; @@ -163,8 +163,8 @@ void tcf_em_unregister(struct tcf_ematch_ops *ops) } EXPORT_SYMBOL(tcf_em_unregister); -static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, - int index) +static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree, + int index) { return &tree->matches[index]; } @@ -184,7 +184,8 @@ static int tcf_em_validate(struct tcf_proto *tp, if (em_hdr->kind == TCF_EM_CONTAINER) { /* Special ematch called "container", carries an index - * referencing an external ematch sequence. */ + * referencing an external ematch sequence. + */ u32 ref; if (data_len < sizeof(ref)) @@ -195,7 +196,8 @@ static int tcf_em_validate(struct tcf_proto *tp, goto errout; /* We do not allow backward jumps to avoid loops and jumps - * to our own position are of course illegal. */ + * to our own position are of course illegal. + */ if (ref <= idx) goto errout; @@ -208,7 +210,8 @@ static int tcf_em_validate(struct tcf_proto *tp, * which automatically releases the reference again, therefore * the module MUST not be given back under any circumstances * here. Be aware, the destroy function assumes that the - * module is held if the ops field is non zero. */ + * module is held if the ops field is non zero. + */ em->ops = tcf_em_lookup(em_hdr->kind); if (em->ops == NULL) { @@ -221,7 +224,8 @@ static int tcf_em_validate(struct tcf_proto *tp, if (em->ops) { /* We dropped the RTNL mutex in order to * perform the module load. Tell the caller - * to replay the request. */ + * to replay the request. + */ module_put(em->ops->owner); err = -EAGAIN; } @@ -230,7 +234,8 @@ static int tcf_em_validate(struct tcf_proto *tp, } /* ematch module provides expected length of data, so we - * can do a basic sanity check. */ + * can do a basic sanity check. + */ if (em->ops->datalen && data_len < em->ops->datalen) goto errout; @@ -246,7 +251,8 @@ static int tcf_em_validate(struct tcf_proto *tp, * TCF_EM_SIMPLE may be specified stating that the * data only consists of a u32 integer and the module * does not expected a memory reference but rather - * the value carried. */ + * the value carried. + */ if (em_hdr->flags & TCF_EM_SIMPLE) { if (data_len < sizeof(u32)) goto errout; @@ -334,7 +340,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, * The array of rt attributes is parsed in the order as they are * provided, their type must be incremental from 1 to n. Even * if it does not serve any real purpose, a failure of sticking - * to this policy will result in parsing failure. */ + * to this policy will result in parsing failure. + */ for (idx = 0; nla_ok(rt_match, list_len); idx++) { err = -EINVAL; @@ -359,7 +366,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, /* Check if the number of matches provided by userspace actually * complies with the array of matches. The number was used for * the validation of references and a mismatch could lead to - * undefined references during the matching process. */ + * undefined references during the matching process. + */ if (idx != tree_hdr->nmatches) { err = -EINVAL; goto errout_abort; @@ -449,7 +457,7 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) .flags = em->flags }; - NLA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr); + NLA_PUT(skb, i + 1, sizeof(em_hdr), &em_hdr); if (em->ops && em->ops->dump) { if (em->ops->dump(skb, em) < 0) @@ -478,6 +486,7 @@ static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { int r = em->ops->match(skb, em, info); + return tcf_em_is_inverted(em) ? !r : r; } @@ -527,8 +536,8 @@ pop_stack: stack_overflow: if (net_ratelimit()) - printk(KERN_WARNING "tc ematch: local stack overflow," - " increase NET_EMATCH_STACK\n"); + pr_warning("tc ematch: local stack overflow," + " increase NET_EMATCH_STACK\n"); return -1; } EXPORT_SYMBOL(__tcf_em_tree_match); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b22ca2d1ceb..15074157940 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -187,7 +187,7 @@ int unregister_qdisc(struct Qdisc_ops *qops) int err = -ENOENT; write_lock(&qdisc_mod_lock); - for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) if (q == qops) break; if (q) { @@ -321,7 +321,9 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab) if (!tab || --tab->refcnt) return; - for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { + for (rtabp = &qdisc_rtab_list; + (rtab = *rtabp) != NULL; + rtabp = &rtab->next) { if (rtab == tab) { *rtabp = rtab->next; kfree(rtab); @@ -396,6 +398,11 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) return stab; } +static void stab_kfree_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct qdisc_size_table, rcu)); +} + void qdisc_put_stab(struct qdisc_size_table *tab) { if (!tab) @@ -405,7 +412,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab) if (--tab->refcnt == 0) { list_del(&tab->list); - kfree(tab); + call_rcu_bh(&tab->rcu, stab_kfree_rcu); } spin_unlock(&qdisc_stab_lock); @@ -428,7 +435,7 @@ nla_put_failure: return -1; } -void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab) +void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) { int pkt_len, slot; @@ -454,14 +461,13 @@ out: pkt_len = 1; qdisc_skb_cb(skb)->pkt_len = pkt_len; } -EXPORT_SYMBOL(qdisc_calculate_pkt_len); +EXPORT_SYMBOL(__qdisc_calculate_pkt_len); void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc) { if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { - printk(KERN_WARNING - "%s: %s qdisc %X: is non-work-conserving?\n", - txt, qdisc->ops->id, qdisc->handle >> 16); + pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", + txt, qdisc->ops->id, qdisc->handle >> 16); qdisc->flags |= TCQ_F_WARN_NONWC; } } @@ -472,7 +478,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, timer); - wd->qdisc->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(wd->qdisc); __netif_schedule(qdisc_root(wd->qdisc)); return HRTIMER_NORESTART; @@ -494,7 +500,7 @@ void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) &qdisc_root_sleeping(wd->qdisc)->state)) return; - wd->qdisc->flags |= TCQ_F_THROTTLED; + qdisc_throttled(wd->qdisc); time = ktime_set(0, 0); time = ktime_add_ns(time, PSCHED_TICKS2NS(expires)); hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS); @@ -504,7 +510,7 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { hrtimer_cancel(&wd->timer); - wd->qdisc->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(wd->qdisc); } EXPORT_SYMBOL(qdisc_watchdog_cancel); @@ -625,7 +631,7 @@ static u32 qdisc_alloc_handle(struct net_device *dev) autohandle = TC_H_MAKE(0x80000000U, 0); } while (qdisc_lookup(dev, autohandle) && --i > 0); - return i>0 ? autohandle : 0; + return i > 0 ? autohandle : 0; } void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) @@ -834,7 +840,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, err = PTR_ERR(stab); goto err_out4; } - sch->stab = stab; + rcu_assign_pointer(sch->stab, stab); } if (tca[TCA_RATE]) { spinlock_t *root_lock; @@ -874,7 +880,7 @@ err_out4: * Any broken qdiscs that would require a ops->reset() here? * The qdisc was never in action so it shouldn't be necessary. */ - qdisc_put_stab(sch->stab); + qdisc_put_stab(rtnl_dereference(sch->stab)); if (ops->destroy) ops->destroy(sch); goto err_out3; @@ -882,7 +888,7 @@ err_out4: static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) { - struct qdisc_size_table *stab = NULL; + struct qdisc_size_table *ostab, *stab = NULL; int err = 0; if (tca[TCA_OPTIONS]) { @@ -899,8 +905,9 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) return PTR_ERR(stab); } - qdisc_put_stab(sch->stab); - sch->stab = stab; + ostab = rtnl_dereference(sch->stab); + rcu_assign_pointer(sch->stab, stab); + qdisc_put_stab(ostab); if (tca[TCA_RATE]) { /* NB: ignores errors from replace_estimator @@ -915,9 +922,8 @@ out: return 0; } -struct check_loop_arg -{ - struct qdisc_walker w; +struct check_loop_arg { + struct qdisc_walker w; struct Qdisc *p; int depth; }; @@ -970,7 +976,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) struct Qdisc *p = NULL; int err; - if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return -ENODEV; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -980,12 +987,12 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if (clid) { if (clid != TC_H_ROOT) { if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { - if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + p = qdisc_lookup(dev, TC_H_MAJ(clid)); + if (!p) return -ENOENT; q = qdisc_leaf(p, clid); - } else { /* ingress */ - if (dev_ingress_queue(dev)) - q = dev_ingress_queue(dev)->qdisc_sleeping; + } else if (dev_ingress_queue(dev)) { + q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { q = dev->qdisc; @@ -996,7 +1003,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if (tcm->tcm_handle && q->handle != tcm->tcm_handle) return -EINVAL; } else { - if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + q = qdisc_lookup(dev, tcm->tcm_handle); + if (!q) return -ENOENT; } @@ -1008,7 +1016,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) return -EINVAL; if (q->handle == 0) return -ENOENT; - if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0) + err = qdisc_graft(dev, p, skb, n, clid, NULL, q); + if (err != 0) return err; } else { qdisc_notify(net, skb, n, clid, NULL, q); @@ -1017,7 +1026,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) } /* - Create/change qdisc. + * Create/change qdisc. */ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) @@ -1036,7 +1045,8 @@ replay: clid = tcm->tcm_parent; q = p = NULL; - if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return -ENODEV; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -1046,12 +1056,12 @@ replay: if (clid) { if (clid != TC_H_ROOT) { if (clid != TC_H_INGRESS) { - if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + p = qdisc_lookup(dev, TC_H_MAJ(clid)); + if (!p) return -ENOENT; q = qdisc_leaf(p, clid); - } else { /* ingress */ - if (dev_ingress_queue_create(dev)) - q = dev_ingress_queue(dev)->qdisc_sleeping; + } else if (dev_ingress_queue_create(dev)) { + q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { q = dev->qdisc; @@ -1063,13 +1073,14 @@ replay: if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { if (tcm->tcm_handle) { - if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) + if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) return -EEXIST; if (TC_H_MIN(tcm->tcm_handle)) return -EINVAL; - if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + q = qdisc_lookup(dev, tcm->tcm_handle); + if (!q) goto create_n_graft; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; @@ -1079,7 +1090,7 @@ replay: atomic_inc(&q->refcnt); goto graft; } else { - if (q == NULL) + if (!q) goto create_n_graft; /* This magic test requires explanation. @@ -1101,9 +1112,9 @@ replay: * For now we select create/graft, if * user gave KIND, which does not match existing. */ - if ((n->nlmsg_flags&NLM_F_CREATE) && - (n->nlmsg_flags&NLM_F_REPLACE) && - ((n->nlmsg_flags&NLM_F_EXCL) || + if ((n->nlmsg_flags & NLM_F_CREATE) && + (n->nlmsg_flags & NLM_F_REPLACE) && + ((n->nlmsg_flags & NLM_F_EXCL) || (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)))) goto create_n_graft; @@ -1118,7 +1129,7 @@ replay: /* Change qdisc parameters */ if (q == NULL) return -ENOENT; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) return -EINVAL; @@ -1128,7 +1139,7 @@ replay: return err; create_n_graft: - if (!(n->nlmsg_flags&NLM_F_CREATE)) + if (!(n->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; if (clid == TC_H_INGRESS) { if (dev_ingress_queue(dev)) @@ -1175,6 +1186,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; + struct qdisc_size_table *stab; nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); tcm = NLMSG_DATA(nlh); @@ -1190,7 +1202,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, goto nla_put_failure; q->qstats.qlen = q->q.qlen; - if (q->stab && qdisc_dump_stab(skb, q->stab) < 0) + stab = rtnl_dereference(q->stab); + if (stab && qdisc_dump_stab(skb, stab) < 0) goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, @@ -1234,16 +1247,19 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, return -ENOBUFS; if (old && !tc_qdisc_dump_ignore(old)) { - if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) + if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, + 0, RTM_DELQDISC) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new)) { - if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, + old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) goto err_out; } if (skb->len) - return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); err_out: kfree_skb(skb); @@ -1275,7 +1291,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, q_idx++; continue; } - if (!tc_qdisc_dump_ignore(q) && + if (!tc_qdisc_dump_ignore(q) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) goto done; @@ -1356,7 +1372,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) u32 qid = TC_H_MAJ(clid); int err; - if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) + dev = __dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return -ENODEV; err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); @@ -1391,9 +1408,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) qid = dev->qdisc->handle; /* Now qid is genuine qdisc handle consistent - both with parent and child. - - TC_H_MAJ(pid) still may be unspecified, complete it now. + * both with parent and child. + * + * TC_H_MAJ(pid) still may be unspecified, complete it now. */ if (pid) pid = TC_H_MAKE(qid, pid); @@ -1403,7 +1420,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) } /* OK. Locate qdisc */ - if ((q = qdisc_lookup(dev, qid)) == NULL) + q = qdisc_lookup(dev, qid); + if (!q) return -ENOENT; /* An check that it supports classes */ @@ -1423,13 +1441,14 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) if (cl == 0) { err = -ENOENT; - if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) + if (n->nlmsg_type != RTM_NEWTCLASS || + !(n->nlmsg_flags & NLM_F_CREATE)) goto out; } else { switch (n->nlmsg_type) { case RTM_NEWTCLASS: err = -EEXIST; - if (n->nlmsg_flags&NLM_F_EXCL) + if (n->nlmsg_flags & NLM_F_EXCL) goto out; break; case RTM_DELTCLASS: @@ -1521,14 +1540,14 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, return -EINVAL; } - return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + return rtnetlink_send(skb, net, pid, RTNLGRP_TC, + n->nlmsg_flags & NLM_F_ECHO); } -struct qdisc_dump_args -{ - struct qdisc_walker w; - struct sk_buff *skb; - struct netlink_callback *cb; +struct qdisc_dump_args { + struct qdisc_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; }; static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) @@ -1590,7 +1609,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) { - struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh); struct net *net = sock_net(skb->sk); struct netdev_queue *dev_queue; struct net_device *dev; @@ -1598,7 +1617,8 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) return 0; - if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) + dev = dev_get_by_index(net, tcm->tcm_ifindex); + if (!dev) return 0; s_t = cb->args[0]; @@ -1621,19 +1641,22 @@ done: } /* Main classifier routine: scans classifier chain attached - to this qdisc, (optionally) tests for protocol and asks - specific classifiers. + * to this qdisc, (optionally) tests for protocol and asks + * specific classifiers. */ int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) { __be16 protocol = skb->protocol; - int err = 0; + int err; for (; tp; tp = tp->next) { - if ((tp->protocol == protocol || - tp->protocol == htons(ETH_P_ALL)) && - (err = tp->classify(skb, tp, res)) >= 0) { + if (tp->protocol != protocol && + tp->protocol != htons(ETH_P_ALL)) + continue; + err = tp->classify(skb, tp, res); + + if (err >= 0) { #ifdef CONFIG_NET_CLS_ACT if (err != TC_ACT_RECLASSIFY && skb->tc_verd) skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); @@ -1664,11 +1687,11 @@ reclassify: if (verd++ >= MAX_REC_LOOP) { if (net_ratelimit()) - printk(KERN_NOTICE - "%s: packet reclassify loop" + pr_notice("%s: packet reclassify loop" " rule prio %u protocol %02x\n", - tp->q->ops->id, - tp->prio & 0xffff, ntohs(tp->protocol)); + tp->q->ops->id, + tp->prio & 0xffff, + ntohs(tp->protocol)); return TC_ACT_SHOT; } skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); @@ -1761,7 +1784,7 @@ static int __init pktsched_init(void) err = register_pernet_subsys(&psched_net_ops); if (err) { - printk(KERN_ERR "pktsched_init: " + pr_err("pktsched_init: " "cannot initialize per netns operations\n"); return err; } diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 943d733409d..3f08158b868 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -319,7 +319,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg) * creation), and one for the reference held when calling delete. */ if (flow->ref < 2) { - printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n", flow->ref); + pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref); return -EINVAL; } if (flow->ref > 2) @@ -384,12 +384,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) } } flow = NULL; - done: - ; +done: + ; } - if (!flow) + if (!flow) { flow = &p->link; - else { + } else { if (flow->vcc) ATM_SKB(skb)->atm_options = flow->vcc->atm_options; /*@@@ looks good ... but it's not supposed to work :-) */ @@ -576,8 +576,7 @@ static void atm_tc_destroy(struct Qdisc *sch) list_for_each_entry_safe(flow, tmp, &p->flows, list) { if (flow->ref > 1) - printk(KERN_ERR "atm_destroy: %p->ref = %d\n", flow, - flow->ref); + pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref); atm_tc_put(sch, (unsigned long)flow); } tasklet_kill(&p->task); @@ -616,9 +615,8 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, } if (flow->excess) NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid); - else { + else NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0); - } nla_nest_end(skb, nest); return skb->len; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index c80d1c210c5..24d94c097b3 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -72,8 +72,7 @@ struct cbq_sched_data; -struct cbq_class -{ +struct cbq_class { struct Qdisc_class_common common; struct cbq_class *next_alive; /* next class with backlog in this priority band */ @@ -139,19 +138,18 @@ struct cbq_class int refcnt; int filters; - struct cbq_class *defaults[TC_PRIO_MAX+1]; + struct cbq_class *defaults[TC_PRIO_MAX + 1]; }; -struct cbq_sched_data -{ +struct cbq_sched_data { struct Qdisc_class_hash clhash; /* Hash table of all classes */ - int nclasses[TC_CBQ_MAXPRIO+1]; - unsigned quanta[TC_CBQ_MAXPRIO+1]; + int nclasses[TC_CBQ_MAXPRIO + 1]; + unsigned int quanta[TC_CBQ_MAXPRIO + 1]; struct cbq_class link; - unsigned activemask; - struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes + unsigned int activemask; + struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes with backlog */ #ifdef CONFIG_NET_CLS_ACT @@ -162,7 +160,7 @@ struct cbq_sched_data int tx_len; psched_time_t now; /* Cached timestamp */ psched_time_t now_rt; /* Cached real time */ - unsigned pmask; + unsigned int pmask; struct hrtimer delay_timer; struct qdisc_watchdog watchdog; /* Watchdog timer, @@ -175,9 +173,9 @@ struct cbq_sched_data }; -#define L2T(cl,len) qdisc_l2t((cl)->R_tab,len) +#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len) -static __inline__ struct cbq_class * +static inline struct cbq_class * cbq_class_lookup(struct cbq_sched_data *q, u32 classid) { struct Qdisc_class_common *clc; @@ -193,25 +191,27 @@ cbq_class_lookup(struct cbq_sched_data *q, u32 classid) static struct cbq_class * cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) { - struct cbq_class *cl, *new; + struct cbq_class *cl; - for (cl = this->tparent; cl; cl = cl->tparent) - if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) - return new; + for (cl = this->tparent; cl; cl = cl->tparent) { + struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT]; + if (new != NULL && new != this) + return new; + } return NULL; } #endif /* Classify packet. The procedure is pretty complicated, but - it allows us to combine link sharing and priority scheduling - transparently. - - Namely, you can put link sharing rules (f.e. route based) at root of CBQ, - so that it resolves to split nodes. Then packets are classified - by logical priority, or a more specific classifier may be attached - to the split node. + * it allows us to combine link sharing and priority scheduling + * transparently. + * + * Namely, you can put link sharing rules (f.e. route based) at root of CBQ, + * so that it resolves to split nodes. Then packets are classified + * by logical priority, or a more specific classifier may be attached + * to the split node. */ static struct cbq_class * @@ -227,7 +227,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 1. If skb->priority points to one of our classes, use it. */ - if (TC_H_MAJ(prio^sch->handle) == 0 && + if (TC_H_MAJ(prio ^ sch->handle) == 0 && (cl = cbq_class_lookup(q, prio)) != NULL) return cl; @@ -243,10 +243,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) (result = tc_classify_compat(skb, head->filter_list, &res)) < 0) goto fallback; - if ((cl = (void*)res.class) == NULL) { + cl = (void *)res.class; + if (!cl) { if (TC_H_MAJ(res.classid)) cl = cbq_class_lookup(q, res.classid); - else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) + else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) cl = defmap[TC_PRIO_BESTEFFORT]; if (cl == NULL || cl->level >= head->level) @@ -282,7 +283,7 @@ fallback: * Step 4. No success... */ if (TC_H_MAJ(prio) == 0 && - !(cl = head->defaults[prio&TC_PRIO_MAX]) && + !(cl = head->defaults[prio & TC_PRIO_MAX]) && !(cl = head->defaults[TC_PRIO_BESTEFFORT])) return head; @@ -290,12 +291,12 @@ fallback: } /* - A packet has just been enqueued on the empty class. - cbq_activate_class adds it to the tail of active class list - of its priority band. + * A packet has just been enqueued on the empty class. + * cbq_activate_class adds it to the tail of active class list + * of its priority band. */ -static __inline__ void cbq_activate_class(struct cbq_class *cl) +static inline void cbq_activate_class(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); int prio = cl->cpriority; @@ -314,9 +315,9 @@ static __inline__ void cbq_activate_class(struct cbq_class *cl) } /* - Unlink class from active chain. - Note that this same procedure is done directly in cbq_dequeue* - during round-robin procedure. + * Unlink class from active chain. + * Note that this same procedure is done directly in cbq_dequeue* + * during round-robin procedure. */ static void cbq_deactivate_class(struct cbq_class *this) @@ -350,7 +351,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) { int toplevel = q->toplevel; - if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { + if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) { psched_time_t now; psched_tdiff_t incr; @@ -363,7 +364,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) q->toplevel = cl->level; return; } - } while ((cl=cl->borrow) != NULL && toplevel > cl->level); + } while ((cl = cl->borrow) != NULL && toplevel > cl->level); } } @@ -390,7 +391,6 @@ cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, cl->q); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; - qdisc_bstats_update(sch, skb); cbq_mark_toplevel(q, cl); if (!cl->next_alive) cbq_activate_class(cl); @@ -418,11 +418,11 @@ static void cbq_ovl_classic(struct cbq_class *cl) delay += cl->offtime; /* - Class goes to sleep, so that it will have no - chance to work avgidle. Let's forgive it 8) - - BTW cbq-2.0 has a crap in this - place, apparently they forgot to shift it by cl->ewma_log. + * Class goes to sleep, so that it will have no + * chance to work avgidle. Let's forgive it 8) + * + * BTW cbq-2.0 has a crap in this + * place, apparently they forgot to shift it by cl->ewma_log. */ if (cl->avgidle < 0) delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); @@ -439,8 +439,8 @@ static void cbq_ovl_classic(struct cbq_class *cl) q->wd_expires = delay; /* Dirty work! We must schedule wakeups based on - real available rate, rather than leaf rate, - which may be tiny (even zero). + * real available rate, rather than leaf rate, + * which may be tiny (even zero). */ if (q->toplevel == TC_CBQ_MAXLEVEL) { struct cbq_class *b; @@ -460,7 +460,7 @@ static void cbq_ovl_classic(struct cbq_class *cl) } /* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when - they go overlimit + * they go overlimit */ static void cbq_ovl_rclassic(struct cbq_class *cl) @@ -595,7 +595,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) struct Qdisc *sch = q->watchdog.qdisc; psched_time_t now; psched_tdiff_t delay = 0; - unsigned pmask; + unsigned int pmask; now = psched_get_time(); @@ -624,7 +624,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); } - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); __netif_schedule(qdisc_root(sch)); return HRTIMER_NORESTART; } @@ -649,7 +649,6 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) ret = qdisc_enqueue(skb, cl->q); if (ret == NET_XMIT_SUCCESS) { sch->q.qlen++; - qdisc_bstats_update(sch, skb); if (!cl->next_alive) cbq_activate_class(cl); return 0; @@ -665,15 +664,15 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) #endif /* - It is mission critical procedure. - - We "regenerate" toplevel cutoff, if transmitting class - has backlog and it is not regulated. It is not part of - original CBQ description, but looks more reasonable. - Probably, it is wrong. This question needs further investigation. -*/ + * It is mission critical procedure. + * + * We "regenerate" toplevel cutoff, if transmitting class + * has backlog and it is not regulated. It is not part of + * original CBQ description, but looks more reasonable. + * Probably, it is wrong. This question needs further investigation. + */ -static __inline__ void +static inline void cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, struct cbq_class *borrowed) { @@ -684,7 +683,7 @@ cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, q->toplevel = borrowed->level; return; } - } while ((borrowed=borrowed->borrow) != NULL); + } while ((borrowed = borrowed->borrow) != NULL); } #if 0 /* It is not necessary now. Uncommenting it @@ -712,10 +711,10 @@ cbq_update(struct cbq_sched_data *q) cl->bstats.bytes += len; /* - (now - last) is total time between packet right edges. - (last_pktlen/rate) is "virtual" busy time, so that - - idle = (now - last) - last_pktlen/rate + * (now - last) is total time between packet right edges. + * (last_pktlen/rate) is "virtual" busy time, so that + * + * idle = (now - last) - last_pktlen/rate */ idle = q->now - cl->last; @@ -725,9 +724,9 @@ cbq_update(struct cbq_sched_data *q) idle -= L2T(cl, len); /* true_avgidle := (1-W)*true_avgidle + W*idle, - where W=2^{-ewma_log}. But cl->avgidle is scaled: - cl->avgidle == true_avgidle/W, - hence: + * where W=2^{-ewma_log}. But cl->avgidle is scaled: + * cl->avgidle == true_avgidle/W, + * hence: */ avgidle += idle - (avgidle>>cl->ewma_log); } @@ -741,22 +740,22 @@ cbq_update(struct cbq_sched_data *q) cl->avgidle = avgidle; /* Calculate expected time, when this class - will be allowed to send. - It will occur, when: - (1-W)*true_avgidle + W*delay = 0, i.e. - idle = (1/W - 1)*(-true_avgidle) - or - idle = (1 - W)*(-cl->avgidle); + * will be allowed to send. + * It will occur, when: + * (1-W)*true_avgidle + W*delay = 0, i.e. + * idle = (1/W - 1)*(-true_avgidle) + * or + * idle = (1 - W)*(-cl->avgidle); */ idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); /* - That is not all. - To maintain the rate allocated to the class, - we add to undertime virtual clock, - necessary to complete transmitted packet. - (len/phys_bandwidth has been already passed - to the moment of cbq_update) + * That is not all. + * To maintain the rate allocated to the class, + * we add to undertime virtual clock, + * necessary to complete transmitted packet. + * (len/phys_bandwidth has been already passed + * to the moment of cbq_update) */ idle -= L2T(&q->link, len); @@ -778,7 +777,7 @@ cbq_update(struct cbq_sched_data *q) cbq_update_toplevel(q, this, q->tx_borrowed); } -static __inline__ struct cbq_class * +static inline struct cbq_class * cbq_under_limit(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); @@ -794,16 +793,17 @@ cbq_under_limit(struct cbq_class *cl) do { /* It is very suspicious place. Now overlimit - action is generated for not bounded classes - only if link is completely congested. - Though it is in agree with ancestor-only paradigm, - it looks very stupid. Particularly, - it means that this chunk of code will either - never be called or result in strong amplification - of burstiness. Dangerous, silly, and, however, - no another solution exists. + * action is generated for not bounded classes + * only if link is completely congested. + * Though it is in agree with ancestor-only paradigm, + * it looks very stupid. Particularly, + * it means that this chunk of code will either + * never be called or result in strong amplification + * of burstiness. Dangerous, silly, and, however, + * no another solution exists. */ - if ((cl = cl->borrow) == NULL) { + cl = cl->borrow; + if (!cl) { this_cl->qstats.overlimits++; this_cl->overlimit(this_cl); return NULL; @@ -816,7 +816,7 @@ cbq_under_limit(struct cbq_class *cl) return cl; } -static __inline__ struct sk_buff * +static inline struct sk_buff * cbq_dequeue_prio(struct Qdisc *sch, int prio) { struct cbq_sched_data *q = qdisc_priv(sch); @@ -840,7 +840,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) if (cl->deficit <= 0) { /* Class exhausted its allotment per - this round. Switch to the next one. + * this round. Switch to the next one. */ deficit = 1; cl->deficit += cl->quantum; @@ -850,8 +850,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) skb = cl->q->dequeue(cl->q); /* Class did not give us any skb :-( - It could occur even if cl->q->q.qlen != 0 - f.e. if cl->q == "tbf" + * It could occur even if cl->q->q.qlen != 0 + * f.e. if cl->q == "tbf" */ if (skb == NULL) goto skip_class; @@ -880,7 +880,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio) skip_class: if (cl->q->q.qlen == 0 || prio != cl->cpriority) { /* Class is empty or penalized. - Unlink it from active chain. + * Unlink it from active chain. */ cl_prev->next_alive = cl->next_alive; cl->next_alive = NULL; @@ -919,14 +919,14 @@ next_class: return NULL; } -static __inline__ struct sk_buff * +static inline struct sk_buff * cbq_dequeue_1(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - unsigned activemask; + unsigned int activemask; - activemask = q->activemask&0xFF; + activemask = q->activemask & 0xFF; while (activemask) { int prio = ffz(~activemask); activemask &= ~(1<<prio); @@ -951,11 +951,11 @@ cbq_dequeue(struct Qdisc *sch) if (q->tx_class) { psched_tdiff_t incr2; /* Time integrator. We calculate EOS time - by adding expected packet transmission time. - If real time is greater, we warp artificial clock, - so that: - - cbq_time = max(real_time, work); + * by adding expected packet transmission time. + * If real time is greater, we warp artificial clock, + * so that: + * + * cbq_time = max(real_time, work); */ incr2 = L2T(&q->link, q->tx_len); q->now += incr2; @@ -971,28 +971,29 @@ cbq_dequeue(struct Qdisc *sch) skb = cbq_dequeue_1(sch); if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); return skb; } /* All the classes are overlimit. - - It is possible, if: - - 1. Scheduler is empty. - 2. Toplevel cutoff inhibited borrowing. - 3. Root class is overlimit. - - Reset 2d and 3d conditions and retry. - - Note, that NS and cbq-2.0 are buggy, peeking - an arbitrary class is appropriate for ancestor-only - sharing, but not for toplevel algorithm. - - Our version is better, but slower, because it requires - two passes, but it is unavoidable with top-level sharing. - */ + * + * It is possible, if: + * + * 1. Scheduler is empty. + * 2. Toplevel cutoff inhibited borrowing. + * 3. Root class is overlimit. + * + * Reset 2d and 3d conditions and retry. + * + * Note, that NS and cbq-2.0 are buggy, peeking + * an arbitrary class is appropriate for ancestor-only + * sharing, but not for toplevel algorithm. + * + * Our version is better, but slower, because it requires + * two passes, but it is unavoidable with top-level sharing. + */ if (q->toplevel == TC_CBQ_MAXLEVEL && q->link.undertime == PSCHED_PASTPERFECT) @@ -1003,7 +1004,8 @@ cbq_dequeue(struct Qdisc *sch) } /* No packets in scheduler or nobody wants to give them to us :-( - Sigh... start watchdog timer in the last case. */ + * Sigh... start watchdog timer in the last case. + */ if (sch->q.qlen) { sch->qstats.overlimits++; @@ -1025,13 +1027,14 @@ static void cbq_adjust_levels(struct cbq_class *this) int level = 0; struct cbq_class *cl; - if ((cl = this->children) != NULL) { + cl = this->children; + if (cl) { do { if (cl->level > level) level = cl->level; } while ((cl = cl->sibling) != this->children); } - this->level = level+1; + this->level = level + 1; } while ((this = this->tparent) != NULL); } @@ -1047,14 +1050,15 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { /* BUGGGG... Beware! This expression suffer of - arithmetic overflows! + * arithmetic overflows! */ if (cl->priority == prio) { cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ q->quanta[prio]; } if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { - printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum); + pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n", + cl->common.classid, cl->quantum); cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; } } @@ -1065,18 +1069,18 @@ static void cbq_sync_defmap(struct cbq_class *cl) { struct cbq_sched_data *q = qdisc_priv(cl->qdisc); struct cbq_class *split = cl->split; - unsigned h; + unsigned int h; int i; if (split == NULL) return; - for (i=0; i<=TC_PRIO_MAX; i++) { - if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) + for (i = 0; i <= TC_PRIO_MAX; i++) { + if (split->defaults[i] == cl && !(cl->defmap & (1<<i))) split->defaults[i] = NULL; } - for (i=0; i<=TC_PRIO_MAX; i++) { + for (i = 0; i <= TC_PRIO_MAX; i++) { int level = split->level; if (split->defaults[i]) @@ -1089,7 +1093,7 @@ static void cbq_sync_defmap(struct cbq_class *cl) hlist_for_each_entry(c, n, &q->clhash.hash[h], common.hnode) { if (c->split == split && c->level < level && - c->defmap&(1<<i)) { + c->defmap & (1<<i)) { split->defaults[i] = c; level = c->level; } @@ -1103,7 +1107,8 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma struct cbq_class *split = NULL; if (splitid == 0) { - if ((split = cl->split) == NULL) + split = cl->split; + if (!split) return; splitid = split->common.classid; } @@ -1121,9 +1126,9 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma cl->defmap = 0; cbq_sync_defmap(cl); cl->split = split; - cl->defmap = def&mask; + cl->defmap = def & mask; } else - cl->defmap = (cl->defmap&~mask)|(def&mask); + cl->defmap = (cl->defmap & ~mask) | (def & mask); cbq_sync_defmap(cl); } @@ -1136,7 +1141,7 @@ static void cbq_unlink_class(struct cbq_class *this) qdisc_class_hash_remove(&q->clhash, &this->common); if (this->tparent) { - clp=&this->sibling; + clp = &this->sibling; cl = *clp; do { if (cl == this) { @@ -1175,7 +1180,7 @@ static void cbq_link_class(struct cbq_class *this) } } -static unsigned int cbq_drop(struct Qdisc* sch) +static unsigned int cbq_drop(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl, *cl_head; @@ -1183,7 +1188,8 @@ static unsigned int cbq_drop(struct Qdisc* sch) unsigned int len; for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { - if ((cl_head = q->active[prio]) == NULL) + cl_head = q->active[prio]; + if (!cl_head) continue; cl = cl_head; @@ -1200,13 +1206,13 @@ static unsigned int cbq_drop(struct Qdisc* sch) } static void -cbq_reset(struct Qdisc* sch) +cbq_reset(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl; struct hlist_node *n; int prio; - unsigned h; + unsigned int h; q->activemask = 0; q->pmask = 0; @@ -1238,21 +1244,21 @@ cbq_reset(struct Qdisc* sch) static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) { - if (lss->change&TCF_CBQ_LSS_FLAGS) { - cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; - cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; + if (lss->change & TCF_CBQ_LSS_FLAGS) { + cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; + cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; } - if (lss->change&TCF_CBQ_LSS_EWMA) + if (lss->change & TCF_CBQ_LSS_EWMA) cl->ewma_log = lss->ewma_log; - if (lss->change&TCF_CBQ_LSS_AVPKT) + if (lss->change & TCF_CBQ_LSS_AVPKT) cl->avpkt = lss->avpkt; - if (lss->change&TCF_CBQ_LSS_MINIDLE) + if (lss->change & TCF_CBQ_LSS_MINIDLE) cl->minidle = -(long)lss->minidle; - if (lss->change&TCF_CBQ_LSS_MAXIDLE) { + if (lss->change & TCF_CBQ_LSS_MAXIDLE) { cl->maxidle = lss->maxidle; cl->avgidle = lss->maxidle; } - if (lss->change&TCF_CBQ_LSS_OFFTIME) + if (lss->change & TCF_CBQ_LSS_OFFTIME) cl->offtime = lss->offtime; return 0; } @@ -1280,10 +1286,10 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) if (wrr->weight) cl->weight = wrr->weight; if (wrr->priority) { - cl->priority = wrr->priority-1; + cl->priority = wrr->priority - 1; cl->cpriority = cl->priority; if (cl->priority >= cl->priority2) - cl->priority2 = TC_CBQ_MAXPRIO-1; + cl->priority2 = TC_CBQ_MAXPRIO - 1; } cbq_addprio(q, cl); @@ -1300,10 +1306,10 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) cl->overlimit = cbq_ovl_delay; break; case TC_CBQ_OVL_LOWPRIO: - if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || - ovl->priority2-1 <= cl->priority) + if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO || + ovl->priority2 - 1 <= cl->priority) return -EINVAL; - cl->priority2 = ovl->priority2-1; + cl->priority2 = ovl->priority2 - 1; cl->overlimit = cbq_ovl_lowprio; break; case TC_CBQ_OVL_DROP: @@ -1382,9 +1388,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) if (!q->link.q) q->link.q = &noop_qdisc; - q->link.priority = TC_CBQ_MAXPRIO-1; - q->link.priority2 = TC_CBQ_MAXPRIO-1; - q->link.cpriority = TC_CBQ_MAXPRIO-1; + q->link.priority = TC_CBQ_MAXPRIO - 1; + q->link.priority2 = TC_CBQ_MAXPRIO - 1; + q->link.cpriority = TC_CBQ_MAXPRIO - 1; q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; q->link.overlimit = cbq_ovl_classic; q->link.allot = psched_mtu(qdisc_dev(sch)); @@ -1415,7 +1421,7 @@ put_rtab: return err; } -static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); @@ -1427,7 +1433,7 @@ nla_put_failure: return -1; } -static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_lssopt opt; @@ -1452,15 +1458,15 @@ nla_put_failure: return -1; } -static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_wrropt opt; opt.flags = 0; opt.allot = cl->allot; - opt.priority = cl->priority+1; - opt.cpriority = cl->cpriority+1; + opt.priority = cl->priority + 1; + opt.cpriority = cl->cpriority + 1; opt.weight = cl->weight; NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); return skb->len; @@ -1470,13 +1476,13 @@ nla_put_failure: return -1; } -static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_ovl opt; opt.strategy = cl->ovl_strategy; - opt.priority2 = cl->priority2+1; + opt.priority2 = cl->priority2 + 1; opt.pad = 0; opt.penalty = cl->penalty; NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); @@ -1487,7 +1493,7 @@ nla_put_failure: return -1; } -static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_fopt opt; @@ -1506,7 +1512,7 @@ nla_put_failure: } #ifdef CONFIG_NET_CLS_ACT -static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) { unsigned char *b = skb_tail_pointer(skb); struct tc_cbq_police opt; @@ -1570,7 +1576,7 @@ static int cbq_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, struct tcmsg *tcm) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; struct nlattr *nest; if (cl->tparent) @@ -1598,7 +1604,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) { struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; cl->qstats.qlen = cl->q->q.qlen; cl->xstats.avgidle = cl->avgidle; @@ -1618,7 +1624,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; if (new == NULL) { new = qdisc_create_dflt(sch->dev_queue, @@ -1641,10 +1647,9 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, return 0; } -static struct Qdisc * -cbq_leaf(struct Qdisc *sch, unsigned long arg) +static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; return cl->q; } @@ -1683,13 +1688,12 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) kfree(cl); } -static void -cbq_destroy(struct Qdisc* sch) +static void cbq_destroy(struct Qdisc *sch) { struct cbq_sched_data *q = qdisc_priv(sch); struct hlist_node *n, *next; struct cbq_class *cl; - unsigned h; + unsigned int h; #ifdef CONFIG_NET_CLS_ACT q->rx_class = NULL; @@ -1713,7 +1717,7 @@ cbq_destroy(struct Qdisc* sch) static void cbq_put(struct Qdisc *sch, unsigned long arg) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; if (--cl->refcnt == 0) { #ifdef CONFIG_NET_CLS_ACT @@ -1736,7 +1740,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t { int err; struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class*)*arg; + struct cbq_class *cl = (struct cbq_class *)*arg; struct nlattr *opt = tca[TCA_OPTIONS]; struct nlattr *tb[TCA_CBQ_MAX + 1]; struct cbq_class *parent; @@ -1828,13 +1832,14 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (classid) { err = -EINVAL; - if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) + if (TC_H_MAJ(classid ^ sch->handle) || + cbq_class_lookup(q, classid)) goto failure; } else { int i; - classid = TC_H_MAKE(sch->handle,0x8000); + classid = TC_H_MAKE(sch->handle, 0x8000); - for (i=0; i<0x8000; i++) { + for (i = 0; i < 0x8000; i++) { if (++q->hgenerator >= 0x8000) q->hgenerator = 1; if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) @@ -1891,11 +1896,11 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cl->minidle = -0x7FFFFFFF; cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); - if (cl->ewma_log==0) + if (cl->ewma_log == 0) cl->ewma_log = q->link.ewma_log; - if (cl->maxidle==0) + if (cl->maxidle == 0) cl->maxidle = q->link.maxidle; - if (cl->avpkt==0) + if (cl->avpkt == 0) cl->avpkt = q->link.avpkt; cl->overlimit = cbq_ovl_classic; if (tb[TCA_CBQ_OVL_STRATEGY]) @@ -1921,7 +1926,7 @@ failure: static int cbq_delete(struct Qdisc *sch, unsigned long arg) { struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; unsigned int qlen; if (cl->filters || cl->children || cl == &q->link) @@ -1979,7 +1984,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, u32 classid) { struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *p = (struct cbq_class*)parent; + struct cbq_class *p = (struct cbq_class *)parent; struct cbq_class *cl = cbq_class_lookup(q, classid); if (cl) { @@ -1993,7 +1998,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) { - struct cbq_class *cl = (struct cbq_class*)arg; + struct cbq_class *cl = (struct cbq_class *)arg; cl->filters--; } @@ -2003,7 +2008,7 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) struct cbq_sched_data *q = qdisc_priv(sch); struct cbq_class *cl; struct hlist_node *n; - unsigned h; + unsigned int h; if (arg->stop) return; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index de55e642eaf..6b7fe4a84f1 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -376,7 +376,6 @@ static int drr_enqueue(struct sk_buff *skb, struct Qdisc *sch) } bstats_update(&cl->bstats, skb); - qdisc_bstats_update(sch, skb); sch->q.qlen++; return err; @@ -403,6 +402,7 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch) skb = qdisc_dequeue_peeked(cl->qdisc); if (cl->qdisc->q.qlen == 0) list_del(&cl->alist); + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 60f4bdd4408..2c790204d04 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -137,10 +137,10 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, mask = nla_get_u8(tb[TCA_DSMARK_MASK]); if (tb[TCA_DSMARK_VALUE]) - p->value[*arg-1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); + p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); if (tb[TCA_DSMARK_MASK]) - p->mask[*arg-1] = mask; + p->mask[*arg - 1] = mask; err = 0; @@ -155,8 +155,8 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg) if (!dsmark_valid_index(p, arg)) return -EINVAL; - p->mask[arg-1] = 0xff; - p->value[arg-1] = 0; + p->mask[arg - 1] = 0xff; + p->value[arg - 1] = 0; return 0; } @@ -175,7 +175,7 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) if (p->mask[i] == 0xff && !p->value[i]) goto ignore; if (walker->count >= walker->skip) { - if (walker->fn(sch, i+1, walker) < 0) { + if (walker->fn(sch, i + 1, walker) < 0) { walker->stop = 1; break; } @@ -260,7 +260,6 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) return err; } - qdisc_bstats_update(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -283,6 +282,7 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) if (skb == NULL) return NULL; + qdisc_bstats_update(sch, skb); sch->q.qlen--; index = skb->tc_index & (p->indices - 1); @@ -304,9 +304,8 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) * and don't need yet another qdisc as a bypass. */ if (p->mask[index] != 0xff || p->value[index]) - printk(KERN_WARNING - "dsmark_dequeue: unsupported protocol %d\n", - ntohs(skb->protocol)); + pr_warning("dsmark_dequeue: unsupported protocol %d\n", + ntohs(skb->protocol)); break; } @@ -424,14 +423,14 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, if (!dsmark_valid_index(p, cl)) return -EINVAL; - tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1); + tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1); tcm->tcm_info = p->q->handle; opts = nla_nest_start(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; - NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl-1]); - NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl-1]); + NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]); + NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]); return nla_nest_end(skb, opts); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index aa4d6337e43..be33f9ddf9d 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -19,12 +19,11 @@ /* 1 band FIFO pseudo-"scheduler" */ -struct fifo_sched_data -{ +struct fifo_sched_data { u32 limit; }; -static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fifo_sched_data *q = qdisc_priv(sch); @@ -34,7 +33,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) return qdisc_reshape_fail(skb, sch); } -static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct fifo_sched_data *q = qdisc_priv(sch); @@ -44,19 +43,16 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) return qdisc_reshape_fail(skb, sch); } -static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct sk_buff *skb_head; struct fifo_sched_data *q = qdisc_priv(sch); if (likely(skb_queue_len(&sch->q) < q->limit)) return qdisc_enqueue_tail(skb, sch); /* queue full, remove one skb to fulfill the limit */ - skb_head = qdisc_dequeue_head(sch); + __qdisc_queue_drop_head(sch, &sch->q); sch->qstats.drops++; - kfree_skb(skb_head); - qdisc_enqueue_tail(skb, sch); return NET_XMIT_CN; @@ -65,11 +61,13 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch) static int fifo_init(struct Qdisc *sch, struct nlattr *opt) { struct fifo_sched_data *q = qdisc_priv(sch); + bool bypass; + bool is_bfifo = sch->ops == &bfifo_qdisc_ops; if (opt == NULL) { u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1; - if (sch->ops == &bfifo_qdisc_ops) + if (is_bfifo) limit *= psched_mtu(qdisc_dev(sch)); q->limit = limit; @@ -82,6 +80,15 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt) q->limit = ctl->limit; } + if (is_bfifo) + bypass = q->limit >= psched_mtu(qdisc_dev(sch)); + else + bypass = q->limit >= 1; + + if (bypass) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 34dc598440a..0da09d50873 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -87,8 +87,8 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb, */ kfree_skb(skb); if (net_ratelimit()) - printk(KERN_WARNING "Dead loop on netdevice %s, " - "fix it urgently!\n", dev_queue->dev->name); + pr_warning("Dead loop on netdevice %s, fix it urgently!\n", + dev_queue->dev->name); ret = qdisc_qlen(q); } else { /* @@ -137,8 +137,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, } else { /* Driver returned NETDEV_TX_BUSY - requeue skb */ if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit())) - printk(KERN_WARNING "BUG %s code %d qlen %d\n", - dev->name, ret, q->q.qlen); + pr_warning("BUG %s code %d qlen %d\n", + dev->name, ret, q->q.qlen); ret = dev_requeue_skb(skb, q); } @@ -412,8 +412,9 @@ static struct Qdisc noqueue_qdisc = { }; -static const u8 prio2band[TC_PRIO_MAX+1] = - { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; +static const u8 prio2band[TC_PRIO_MAX + 1] = { + 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 +}; /* 3-band FIFO queue: old style, but should be a bit faster than generic prio+fifo combination. @@ -445,7 +446,7 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv, return priv->q + band; } -static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc) { if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { int band = prio2band[skb->priority & TC_PRIO_MAX]; @@ -460,7 +461,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) return qdisc_drop(skb, qdisc); } -static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc) +static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); int band = bitmap2band[priv->bitmap]; @@ -479,7 +480,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc) return NULL; } -static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc) +static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc) { struct pfifo_fast_priv *priv = qdisc_priv(qdisc); int band = bitmap2band[priv->bitmap]; @@ -493,7 +494,7 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc) return NULL; } -static void pfifo_fast_reset(struct Qdisc* qdisc) +static void pfifo_fast_reset(struct Qdisc *qdisc) { int prio; struct pfifo_fast_priv *priv = qdisc_priv(qdisc); @@ -510,7 +511,7 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) { struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; - memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); + memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1); NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); return skb->len; @@ -526,6 +527,8 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt) for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) skb_queue_head_init(band2list(priv, prio)); + /* Can by-pass the queue discipline */ + qdisc->flags |= TCQ_F_CAN_BYPASS; return 0; } @@ -540,6 +543,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { .dump = pfifo_fast_dump, .owner = THIS_MODULE, }; +EXPORT_SYMBOL(pfifo_fast_ops); struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, struct Qdisc_ops *ops) @@ -630,7 +634,7 @@ void qdisc_destroy(struct Qdisc *qdisc) #ifdef CONFIG_NET_SCHED qdisc_list_del(qdisc); - qdisc_put_stab(qdisc->stab); + qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); if (ops->reset) @@ -674,25 +678,21 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue, return oqdisc; } +EXPORT_SYMBOL(dev_graft_qdisc); static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { - struct Qdisc *qdisc; + struct Qdisc *qdisc = &noqueue_qdisc; if (dev->tx_queue_len) { qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, TC_H_ROOT); if (!qdisc) { - printk(KERN_INFO "%s: activation failed\n", dev->name); + netdev_info(dev, "activation failed\n"); return; } - - /* Can by-pass the queue discipline for default qdisc */ - qdisc->flags |= TCQ_F_CAN_BYPASS; - } else { - qdisc = &noqueue_qdisc; } dev_queue->qdisc_sleeping = qdisc; } @@ -761,6 +761,7 @@ void dev_activate(struct net_device *dev) dev_watchdog_up(dev); } } +EXPORT_SYMBOL(dev_activate); static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, @@ -840,6 +841,7 @@ void dev_deactivate(struct net_device *dev) list_add(&dev->unreg_list, &single); dev_deactivate_many(&single); } +EXPORT_SYMBOL(dev_deactivate); static void dev_init_scheduler_queue(struct net_device *dev, struct netdev_queue *dev_queue, diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 51dcc2aa5c9..b9493a09a87 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -32,8 +32,7 @@ struct gred_sched_data; struct gred_sched; -struct gred_sched_data -{ +struct gred_sched_data { u32 limit; /* HARD maximal queue length */ u32 DP; /* the drop pramaters */ u32 bytesin; /* bytes seen on virtualQ so far*/ @@ -50,8 +49,7 @@ enum { GRED_RIO_MODE, }; -struct gred_sched -{ +struct gred_sched { struct gred_sched_data *tab[MAX_DPs]; unsigned long flags; u32 red_flags; @@ -150,17 +148,18 @@ static inline int gred_use_harddrop(struct gred_sched *t) return t->red_flags & TC_RED_HARDDROP; } -static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch) { - struct gred_sched_data *q=NULL; - struct gred_sched *t= qdisc_priv(sch); + struct gred_sched_data *q = NULL; + struct gred_sched *t = qdisc_priv(sch); unsigned long qavg = 0; u16 dp = tc_index_to_dp(skb); - if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { + if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { dp = t->def; - if ((q = t->tab[dp]) == NULL) { + q = t->tab[dp]; + if (!q) { /* Pass through packets not assigned to a DP * if no default DP has been configured. This * allows for DP flows to be left untouched. @@ -183,7 +182,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) for (i = 0; i < t->DPs; i++) { if (t->tab[i] && t->tab[i]->prio < q->prio && !red_is_idling(&t->tab[i]->parms)) - qavg +=t->tab[i]->parms.qavg; + qavg += t->tab[i]->parms.qavg; } } @@ -203,28 +202,28 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) gred_store_wred_set(t, q); switch (red_action(&q->parms, q->parms.qavg + qavg)) { - case RED_DONT_MARK: - break; - - case RED_PROB_MARK: - sch->qstats.overlimits++; - if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { - q->stats.prob_drop++; - goto congestion_drop; - } - - q->stats.prob_mark++; - break; - - case RED_HARD_MARK: - sch->qstats.overlimits++; - if (gred_use_harddrop(t) || !gred_use_ecn(t) || - !INET_ECN_set_ce(skb)) { - q->stats.forced_drop++; - goto congestion_drop; - } - q->stats.forced_mark++; - break; + case RED_DONT_MARK: + break; + + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + break; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (gred_use_harddrop(t) || !gred_use_ecn(t) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + q->stats.forced_mark++; + break; } if (q->backlog + qdisc_pkt_len(skb) <= q->limit) { @@ -241,7 +240,7 @@ congestion_drop: return NET_XMIT_CN; } -static struct sk_buff *gred_dequeue(struct Qdisc* sch) +static struct sk_buff *gred_dequeue(struct Qdisc *sch) { struct sk_buff *skb; struct gred_sched *t = qdisc_priv(sch); @@ -254,9 +253,9 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch) if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { if (net_ratelimit()) - printk(KERN_WARNING "GRED: Unable to relocate " - "VQ 0x%x after dequeue, screwing up " - "backlog.\n", tc_index_to_dp(skb)); + pr_warning("GRED: Unable to relocate VQ 0x%x " + "after dequeue, screwing up " + "backlog.\n", tc_index_to_dp(skb)); } else { q->backlog -= qdisc_pkt_len(skb); @@ -273,7 +272,7 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch) return NULL; } -static unsigned int gred_drop(struct Qdisc* sch) +static unsigned int gred_drop(struct Qdisc *sch) { struct sk_buff *skb; struct gred_sched *t = qdisc_priv(sch); @@ -286,9 +285,9 @@ static unsigned int gred_drop(struct Qdisc* sch) if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { if (net_ratelimit()) - printk(KERN_WARNING "GRED: Unable to relocate " - "VQ 0x%x while dropping, screwing up " - "backlog.\n", tc_index_to_dp(skb)); + pr_warning("GRED: Unable to relocate VQ 0x%x " + "while dropping, screwing up " + "backlog.\n", tc_index_to_dp(skb)); } else { q->backlog -= len; q->stats.other++; @@ -308,7 +307,7 @@ static unsigned int gred_drop(struct Qdisc* sch) } -static void gred_reset(struct Qdisc* sch) +static void gred_reset(struct Qdisc *sch) { int i; struct gred_sched *t = qdisc_priv(sch); @@ -369,8 +368,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps) for (i = table->DPs; i < MAX_DPs; i++) { if (table->tab[i]) { - printk(KERN_WARNING "GRED: Warning: Destroying " - "shadowed VQ 0x%x\n", i); + pr_warning("GRED: Warning: Destroying " + "shadowed VQ 0x%x\n", i); gred_destroy_vq(table->tab[i]); table->tab[i] = NULL; } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 2e45791d4f6..6488e642565 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -81,8 +81,7 @@ * that are expensive on 32-bit architectures. */ -struct internal_sc -{ +struct internal_sc { u64 sm1; /* scaled slope of the 1st segment */ u64 ism1; /* scaled inverse-slope of the 1st segment */ u64 dx; /* the x-projection of the 1st segment */ @@ -92,8 +91,7 @@ struct internal_sc }; /* runtime service curve */ -struct runtime_sc -{ +struct runtime_sc { u64 x; /* current starting position on x-axis */ u64 y; /* current starting position on y-axis */ u64 sm1; /* scaled slope of the 1st segment */ @@ -104,15 +102,13 @@ struct runtime_sc u64 ism2; /* scaled inverse-slope of the 2nd segment */ }; -enum hfsc_class_flags -{ +enum hfsc_class_flags { HFSC_RSC = 0x1, HFSC_FSC = 0x2, HFSC_USC = 0x4 }; -struct hfsc_class -{ +struct hfsc_class { struct Qdisc_class_common cl_common; unsigned int refcnt; /* usage count */ @@ -140,8 +136,8 @@ struct hfsc_class u64 cl_cumul; /* cumulative work in bytes done by real-time criteria */ - u64 cl_d; /* deadline*/ - u64 cl_e; /* eligible time */ + u64 cl_d; /* deadline*/ + u64 cl_e; /* eligible time */ u64 cl_vt; /* virtual time */ u64 cl_f; /* time when this class will fit for link-sharing, max(myf, cfmin) */ @@ -176,8 +172,7 @@ struct hfsc_class unsigned long cl_nactive; /* number of active children */ }; -struct hfsc_sched -{ +struct hfsc_sched { u16 defcls; /* default class id */ struct hfsc_class root; /* root class */ struct Qdisc_class_hash clhash; /* class hash */ @@ -693,7 +688,7 @@ init_vf(struct hfsc_class *cl, unsigned int len) if (go_active) { n = rb_last(&cl->cl_parent->vt_tree); if (n != NULL) { - max_cl = rb_entry(n, struct hfsc_class,vt_node); + max_cl = rb_entry(n, struct hfsc_class, vt_node); /* * set vt to the average of the min and max * classes. if the parent's period didn't @@ -1177,8 +1172,10 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) return NULL; } #endif - if ((cl = (struct hfsc_class *)res.class) == NULL) { - if ((cl = hfsc_find_class(res.classid, sch)) == NULL) + cl = (struct hfsc_class *)res.class; + if (!cl) { + cl = hfsc_find_class(res.classid, sch); + if (!cl) break; /* filter selected invalid classid */ if (cl->level >= head->level) break; /* filter may only point downwards */ @@ -1316,7 +1313,7 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) return -1; } -static inline int +static int hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) { if ((cl->cl_flags & HFSC_RSC) && @@ -1420,7 +1417,8 @@ hfsc_schedule_watchdog(struct Qdisc *sch) struct hfsc_class *cl; u64 next_time = 0; - if ((cl = eltree_get_minel(q)) != NULL) + cl = eltree_get_minel(q); + if (cl) next_time = cl->cl_e; if (q->root.cl_cfmin != 0) { if (next_time == 0 || next_time > q->root.cl_cfmin) @@ -1600,7 +1598,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) set_active(cl, qdisc_pkt_len(skb)); bstats_update(&cl->bstats, skb); - qdisc_bstats_update(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; @@ -1626,7 +1623,8 @@ hfsc_dequeue(struct Qdisc *sch) * find the class with the minimum deadline among * the eligible classes. */ - if ((cl = eltree_get_mindl(q, cur_time)) != NULL) { + cl = eltree_get_mindl(q, cur_time); + if (cl) { realtime = 1; } else { /* @@ -1665,7 +1663,8 @@ hfsc_dequeue(struct Qdisc *sch) set_passive(cl); } - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 984c1b0c683..e1429a85091 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -99,9 +99,10 @@ struct htb_class { struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */ struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ /* When class changes from state 1->2 and disconnects from - parent's feed then we lost ptr value and start from the - first child again. Here we store classid of the - last valid ptr (used when ptr is NULL). */ + * parent's feed then we lost ptr value and start from the + * first child again. Here we store classid of the + * last valid ptr (used when ptr is NULL). + */ u32 last_ptr_id[TC_HTB_NUMPRIO]; } inner; } un; @@ -185,7 +186,7 @@ static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch) * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull * then finish and return direct queue. */ -#define HTB_DIRECT (struct htb_class*)-1 +#define HTB_DIRECT ((struct htb_class *)-1L) static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) @@ -197,11 +198,13 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int result; /* allow to select class by setting skb->priority to valid classid; - note that nfmark can be used too by attaching filter fw with no - rules in it */ + * note that nfmark can be used too by attaching filter fw with no + * rules in it + */ if (skb->priority == sch->handle) return HTB_DIRECT; /* X:0 (direct flow) selected */ - if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0) + cl = htb_find(skb->priority, sch); + if (cl && cl->level == 0) return cl; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; @@ -216,10 +219,12 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, return NULL; } #endif - if ((cl = (void *)res.class) == NULL) { + cl = (void *)res.class; + if (!cl) { if (res.classid == sch->handle) return HTB_DIRECT; /* X:0 (direct flow) */ - if ((cl = htb_find(res.classid, sch)) == NULL) + cl = htb_find(res.classid, sch); + if (!cl) break; /* filter selected invalid classid */ } if (!cl->level) @@ -378,7 +383,8 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) if (p->un.inner.feed[prio].rb_node) /* parent already has its feed in use so that - reset bit in mask as parent is already ok */ + * reset bit in mask as parent is already ok + */ mask &= ~(1 << prio); htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio); @@ -413,8 +419,9 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) if (p->un.inner.ptr[prio] == cl->node + prio) { /* we are removing child which is pointed to from - parent feed - forget the pointer but remember - classid */ + * parent feed - forget the pointer but remember + * classid + */ p->un.inner.last_ptr_id[prio] = cl->common.classid; p->un.inner.ptr[prio] = NULL; } @@ -574,7 +581,6 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) } sch->q.qlen++; - qdisc_bstats_update(sch, skb); return NET_XMIT_SUCCESS; } @@ -664,8 +670,9 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level, unsigned long start) { /* don't run for longer than 2 jiffies; 2 is used instead of - 1 to simplify things when jiffy is going to be incremented - too soon */ + * 1 to simplify things when jiffy is going to be incremented + * too soon + */ unsigned long stop_at = start + 2; while (time_before(jiffies, stop_at)) { struct htb_class *cl; @@ -688,7 +695,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level, /* too much load - let's continue after a break for scheduling */ if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) { - printk(KERN_WARNING "htb: too many events!\n"); + pr_warning("htb: too many events!\n"); q->warned |= HTB_WARN_TOOMANYEVENTS; } @@ -696,7 +703,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level, } /* Returns class->node+prio from id-tree where classe's id is >= id. NULL - is no such one exists. */ + * is no such one exists. + */ static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, u32 id) { @@ -740,12 +748,14 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, for (i = 0; i < 65535; i++) { if (!*sp->pptr && *sp->pid) { /* ptr was invalidated but id is valid - try to recover - the original or next ptr */ + * the original or next ptr + */ *sp->pptr = htb_id_find_next_upper(prio, sp->root, *sp->pid); } *sp->pid = 0; /* ptr is valid now so that remove this hint as it - can become out of date quickly */ + * can become out of date quickly + */ if (!*sp->pptr) { /* we are at right end; rewind & go up */ *sp->pptr = sp->root; while ((*sp->pptr)->rb_left) @@ -773,7 +783,8 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio, } /* dequeues packet at given priority and level; call only if - you are sure that there is active class at prio/level */ + * you are sure that there is active class at prio/level + */ static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio, int level) { @@ -790,9 +801,10 @@ next: return NULL; /* class can be empty - it is unlikely but can be true if leaf - qdisc drops packets in enqueue routine or if someone used - graft operation on the leaf since last dequeue; - simply deactivate and skip such class */ + * qdisc drops packets in enqueue routine or if someone used + * graft operation on the leaf since last dequeue; + * simply deactivate and skip such class + */ if (unlikely(cl->un.leaf.q->q.qlen == 0)) { struct htb_class *next; htb_deactivate(q, cl); @@ -832,7 +844,8 @@ next: ptr[0]) + prio); } /* this used to be after charge_class but this constelation - gives us slightly better performance */ + * gives us slightly better performance + */ if (!cl->un.leaf.q->q.qlen) htb_deactivate(q, cl); htb_charge_class(q, cl, level, skb); @@ -842,7 +855,7 @@ next: static struct sk_buff *htb_dequeue(struct Qdisc *sch) { - struct sk_buff *skb = NULL; + struct sk_buff *skb; struct htb_sched *q = qdisc_priv(sch); int level; psched_time_t next_event; @@ -851,7 +864,9 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) /* try to dequeue direct packets as high prio (!) to minimize cpu work */ skb = __skb_dequeue(&q->direct_queue); if (skb != NULL) { - sch->flags &= ~TCQ_F_THROTTLED; +ok: + qdisc_bstats_update(sch, skb); + qdisc_unthrottled(sch); sch->q.qlen--; return skb; } @@ -882,13 +897,11 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch) m = ~q->row_mask[level]; while (m != (int)(-1)) { int prio = ffz(m); + m |= 1 << prio; skb = htb_dequeue_tree(q, prio, level); - if (likely(skb != NULL)) { - sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; - goto fin; - } + if (likely(skb != NULL)) + goto ok; } } sch->qstats.overlimits++; @@ -989,13 +1002,12 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) return err; if (tb[TCA_HTB_INIT] == NULL) { - printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); + pr_err("HTB: hey probably you have bad tc tool ?\n"); return -EINVAL; } gopt = nla_data(tb[TCA_HTB_INIT]); if (gopt->version != HTB_VER >> 16) { - printk(KERN_ERR - "HTB: need tc/htb version %d (minor is %d), you have %d\n", + pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n", HTB_VER >> 16, HTB_VER & 0xffff, gopt->version); return -EINVAL; } @@ -1208,9 +1220,10 @@ static void htb_destroy(struct Qdisc *sch) cancel_work_sync(&q->work); qdisc_watchdog_cancel(&q->watchdog); /* This line used to be after htb_destroy_class call below - and surprisingly it worked in 2.4. But it must precede it - because filter need its target class alive to be able to call - unbind_filter on it (without Oops). */ + * and surprisingly it worked in 2.4. But it must precede it + * because filter need its target class alive to be able to call + * unbind_filter on it (without Oops). + */ tcf_destroy_chain(&q->filter_list); for (i = 0; i < q->clhash.hashsize; i++) { @@ -1344,11 +1357,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* check maximal depth */ if (parent && parent->parent && parent->parent->level < 2) { - printk(KERN_ERR "htb: tree is too deep\n"); + pr_err("htb: tree is too deep\n"); goto failure; } err = -ENOBUFS; - if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL) + cl = kzalloc(sizeof(*cl), GFP_KERNEL); + if (!cl) goto failure; err = gen_new_estimator(&cl->bstats, &cl->rate_est, @@ -1368,8 +1382,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, RB_CLEAR_NODE(&cl->node[prio]); /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) - so that can't be used inside of sch_tree_lock - -- thanks to Karlis Peisenieks */ + * so that can't be used inside of sch_tree_lock + * -- thanks to Karlis Peisenieks + */ new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); sch_tree_lock(sch); @@ -1421,17 +1436,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, } /* it used to be a nasty bug here, we have to check that node - is really leaf before changing cl->un.leaf ! */ + * is really leaf before changing cl->un.leaf ! + */ if (!cl->level) { cl->quantum = rtab->rate.rate / q->rate2quantum; if (!hopt->quantum && cl->quantum < 1000) { - printk(KERN_WARNING + pr_warning( "HTB: quantum of class %X is small. Consider r2q change.\n", cl->common.classid); cl->quantum = 1000; } if (!hopt->quantum && cl->quantum > 200000) { - printk(KERN_WARNING + pr_warning( "HTB: quantum of class %X is big. Consider r2q change.\n", cl->common.classid); cl->quantum = 200000; @@ -1480,13 +1496,13 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, struct htb_class *cl = htb_find(classid, sch); /*if (cl && !cl->level) return 0; - The line above used to be there to prevent attaching filters to - leaves. But at least tc_index filter uses this just to get class - for other reasons so that we have to allow for it. - ---- - 19.6.2002 As Werner explained it is ok - bind filter is just - another way to "lock" the class - unlike "get" this lock can - be broken by class during destroy IIUC. + * The line above used to be there to prevent attaching filters to + * leaves. But at least tc_index filter uses this just to get class + * for other reasons so that we have to allow for it. + * ---- + * 19.6.2002 As Werner explained it is ok - bind filter is just + * another way to "lock" the class - unlike "get" this lock can + * be broken by class during destroy IIUC. */ if (cl) cl->filter_cnt++; diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index ecc302f4d2a..ec5cbc84896 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -61,7 +61,6 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) TC_H_MIN(ntx + 1))); if (qdisc == NULL) goto err; - qdisc->flags |= TCQ_F_CAN_BYPASS; priv->qdiscs[ntx] = qdisc; } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c new file mode 100644 index 00000000000..fbc6f53cb1b --- /dev/null +++ b/net/sched/sch_mqprio.c @@ -0,0 +1,416 @@ +/* + * net/sched/sch_mqprio.c + * + * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <net/netlink.h> +#include <net/pkt_sched.h> +#include <net/sch_generic.h> + +struct mqprio_sched { + struct Qdisc **qdiscs; + int hw_owned; +}; + +static void mqprio_destroy(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + unsigned int ntx; + + if (!priv->qdiscs) + return; + + for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) + qdisc_destroy(priv->qdiscs[ntx]); + + if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) + dev->netdev_ops->ndo_setup_tc(dev, 0); + else + netdev_set_num_tc(dev, 0); + + kfree(priv->qdiscs); +} + +static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) +{ + int i, j; + + /* Verify num_tc is not out of max range */ + if (qopt->num_tc > TC_MAX_QUEUE) + return -EINVAL; + + /* Verify priority mapping uses valid tcs */ + for (i = 0; i < TC_BITMASK + 1; i++) { + if (qopt->prio_tc_map[i] >= qopt->num_tc) + return -EINVAL; + } + + /* net_device does not support requested operation */ + if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) + return -EINVAL; + + /* if hw owned qcount and qoffset are taken from LLD so + * no reason to verify them here + */ + if (qopt->hw) + return 0; + + for (i = 0; i < qopt->num_tc; i++) { + unsigned int last = qopt->offset[i] + qopt->count[i]; + + /* Verify the queue count is in tx range being equal to the + * real_num_tx_queues indicates the last queue is in use. + */ + if (qopt->offset[i] >= dev->real_num_tx_queues || + !qopt->count[i] || + last > dev->real_num_tx_queues) + return -EINVAL; + + /* Verify that the offset and counts do not overlap */ + for (j = i + 1; j < qopt->num_tc; j++) { + if (last > qopt->offset[j]) + return -EINVAL; + } + } + + return 0; +} + +static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + struct netdev_queue *dev_queue; + struct Qdisc *qdisc; + int i, err = -EOPNOTSUPP; + struct tc_mqprio_qopt *qopt = NULL; + + BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); + BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK); + + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + + if (nla_len(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = nla_data(opt); + if (mqprio_parse_opt(dev, qopt)) + return -EINVAL; + + /* pre-allocate qdisc, attachment can't fail */ + priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), + GFP_KERNEL); + if (priv->qdiscs == NULL) { + err = -ENOMEM; + goto err; + } + + for (i = 0; i < dev->num_tx_queues; i++) { + dev_queue = netdev_get_tx_queue(dev, i); + qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1))); + if (qdisc == NULL) { + err = -ENOMEM; + goto err; + } + priv->qdiscs[i] = qdisc; + } + + /* If the mqprio options indicate that hardware should own + * the queue mapping then run ndo_setup_tc otherwise use the + * supplied and verified mapping + */ + if (qopt->hw) { + priv->hw_owned = 1; + err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc); + if (err) + goto err; + } else { + netdev_set_num_tc(dev, qopt->num_tc); + for (i = 0; i < qopt->num_tc; i++) + netdev_set_tc_queue(dev, i, + qopt->count[i], qopt->offset[i]); + } + + /* Always use supplied priority mappings */ + for (i = 0; i < TC_BITMASK + 1; i++) + netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]); + + sch->flags |= TCQ_F_MQROOT; + return 0; + +err: + mqprio_destroy(sch); + return err; +} + +static void mqprio_attach(struct Qdisc *sch) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + struct Qdisc *qdisc; + unsigned int ntx; + + /* Attach underlying qdisc */ + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + qdisc = priv->qdiscs[ntx]; + qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc); + if (qdisc) + qdisc_destroy(qdisc); + } + kfree(priv->qdiscs); + priv->qdiscs = NULL; +} + +static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch, + unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx = cl - 1 - netdev_get_num_tc(dev); + + if (ntx >= dev->num_tx_queues) + return NULL; + return netdev_get_tx_queue(dev, ntx); +} + +static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old) +{ + struct net_device *dev = qdisc_dev(sch); + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + if (!dev_queue) + return -EINVAL; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + *old = dev_graft_qdisc(dev_queue, new); + + if (dev->flags & IFF_UP) + dev_activate(dev); + + return 0; +} + +static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct net_device *dev = qdisc_dev(sch); + struct mqprio_sched *priv = qdisc_priv(sch); + unsigned char *b = skb_tail_pointer(skb); + struct tc_mqprio_qopt opt; + struct Qdisc *qdisc; + unsigned int i; + + sch->q.qlen = 0; + memset(&sch->bstats, 0, sizeof(sch->bstats)); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + for (i = 0; i < dev->num_tx_queues; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc; + spin_lock_bh(qdisc_lock(qdisc)); + sch->q.qlen += qdisc->q.qlen; + sch->bstats.bytes += qdisc->bstats.bytes; + sch->bstats.packets += qdisc->bstats.packets; + sch->qstats.qlen += qdisc->qstats.qlen; + sch->qstats.backlog += qdisc->qstats.backlog; + sch->qstats.drops += qdisc->qstats.drops; + sch->qstats.requeues += qdisc->qstats.requeues; + sch->qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + + opt.num_tc = netdev_get_num_tc(dev); + memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); + opt.hw = priv->hw_owned; + + for (i = 0; i < netdev_get_num_tc(dev); i++) { + opt.count[i] = dev->tc_to_txq[i].count; + opt.offset[i] = dev->tc_to_txq[i].offset; + } + + NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + if (!dev_queue) + return NULL; + + return dev_queue->qdisc_sleeping; +} + +static unsigned long mqprio_get(struct Qdisc *sch, u32 classid) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned int ntx = TC_H_MIN(classid); + + if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev)) + return 0; + return ntx; +} + +static void mqprio_put(struct Qdisc *sch, unsigned long cl) +{ +} + +static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct net_device *dev = qdisc_dev(sch); + + if (cl <= netdev_get_num_tc(dev)) { + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_info = 0; + } else { + int i; + struct netdev_queue *dev_queue; + + dev_queue = mqprio_queue_get(sch, cl); + tcm->tcm_parent = 0; + for (i = 0; i < netdev_get_num_tc(dev); i++) { + struct netdev_tc_txq tc = dev->tc_to_txq[i]; + int q_idx = cl - netdev_get_num_tc(dev); + + if (q_idx > tc.offset && + q_idx <= tc.offset + tc.count) { + tcm->tcm_parent = + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1)); + break; + } + } + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + } + tcm->tcm_handle |= TC_H_MIN(cl); + return 0; +} + +static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) +{ + struct net_device *dev = qdisc_dev(sch); + + if (cl <= netdev_get_num_tc(dev)) { + int i; + struct Qdisc *qdisc; + struct gnet_stats_queue qstats = {0}; + struct gnet_stats_basic_packed bstats = {0}; + struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1]; + + /* Drop lock here it will be reclaimed before touching + * statistics this is required because the d->lock we + * hold here is the look on dev_queue->qdisc_sleeping + * also acquired below. + */ + spin_unlock_bh(d->lock); + + for (i = tc.offset; i < tc.offset + tc.count; i++) { + qdisc = netdev_get_tx_queue(dev, i)->qdisc; + spin_lock_bh(qdisc_lock(qdisc)); + bstats.bytes += qdisc->bstats.bytes; + bstats.packets += qdisc->bstats.packets; + qstats.qlen += qdisc->qstats.qlen; + qstats.backlog += qdisc->qstats.backlog; + qstats.drops += qdisc->qstats.drops; + qstats.requeues += qdisc->qstats.requeues; + qstats.overlimits += qdisc->qstats.overlimits; + spin_unlock_bh(qdisc_lock(qdisc)); + } + /* Reclaim root sleeping lock before completing stats */ + spin_lock_bh(d->lock); + if (gnet_stats_copy_basic(d, &bstats) < 0 || + gnet_stats_copy_queue(d, &qstats) < 0) + return -1; + } else { + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; + sch->qstats.qlen = sch->q.qlen; + if (gnet_stats_copy_basic(d, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, &sch->qstats) < 0) + return -1; + } + return 0; +} + +static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx; + + if (arg->stop) + return; + + /* Walk hierarchy with a virtual class per tc */ + arg->count = arg->skip; + for (ntx = arg->skip; + ntx < dev->num_tx_queues + netdev_get_num_tc(dev); + ntx++) { + if (arg->fn(sch, ntx + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static const struct Qdisc_class_ops mqprio_class_ops = { + .graft = mqprio_graft, + .leaf = mqprio_leaf, + .get = mqprio_get, + .put = mqprio_put, + .walk = mqprio_walk, + .dump = mqprio_dump_class, + .dump_stats = mqprio_dump_class_stats, +}; + +struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { + .cl_ops = &mqprio_class_ops, + .id = "mqprio", + .priv_size = sizeof(struct mqprio_sched), + .init = mqprio_init, + .destroy = mqprio_destroy, + .attach = mqprio_attach, + .dump = mqprio_dump, + .owner = THIS_MODULE, +}; + +static int __init mqprio_module_init(void) +{ + return register_qdisc(&mqprio_qdisc_ops); +} + +static void __exit mqprio_module_exit(void) +{ + unregister_qdisc(&mqprio_qdisc_ops); +} + +module_init(mqprio_module_init); +module_exit(mqprio_module_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 21f13da2476..edc1950e0e7 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -83,7 +83,6 @@ multiq_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, qdisc); if (ret == NET_XMIT_SUCCESS) { - qdisc_bstats_update(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -112,6 +111,7 @@ static struct sk_buff *multiq_dequeue(struct Qdisc *sch) qdisc = q->queues[q->curband]; skb = qdisc->dequeue(qdisc); if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } @@ -156,7 +156,7 @@ static unsigned int multiq_drop(struct Qdisc *sch) unsigned int len; struct Qdisc *qdisc; - for (band = q->bands-1; band >= 0; band--) { + for (band = q->bands - 1; band >= 0; band--) { qdisc = q->queues[band]; if (qdisc->ops->drop) { len = qdisc->ops->drop(qdisc); @@ -265,7 +265,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < q->max_bands; i++) q->queues[i] = &noop_qdisc; - err = multiq_tune(sch,opt); + err = multiq_tune(sch, opt); if (err) kfree(q->queues); @@ -346,7 +346,7 @@ static int multiq_dump_class(struct Qdisc *sch, unsigned long cl, struct multiq_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(cl); - tcm->tcm_info = q->queues[cl-1]->handle; + tcm->tcm_info = q->queues[cl - 1]->handle; return 0; } @@ -378,7 +378,7 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) arg->count++; continue; } - if (arg->fn(sch, band+1, arg) < 0) { + if (arg->fn(sch, band + 1, arg) < 0) { arg->stop = 1; break; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 1c4bce86347..64f0d3293b4 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -211,8 +211,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) } cb = netem_skb_cb(skb); - if (q->gap == 0 || /* not doing reordering */ - q->counter < q->gap || /* inside last reordering gap */ + if (q->gap == 0 || /* not doing reordering */ + q->counter < q->gap || /* inside last reordering gap */ q->reorder < get_crandom(&q->reorder_cor)) { psched_time_t now; psched_tdiff_t delay; @@ -240,7 +240,6 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) if (likely(ret == NET_XMIT_SUCCESS)) { sch->q.qlen++; - qdisc_bstats_update(sch, skb); } else if (net_xmit_drop_count(ret)) { sch->qstats.drops++; } @@ -249,7 +248,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) return ret; } -static unsigned int netem_drop(struct Qdisc* sch) +static unsigned int netem_drop(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); unsigned int len = 0; @@ -266,7 +265,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; - if (sch->flags & TCQ_F_THROTTLED) + if (qdisc_is_throttled(sch)) return NULL; skb = q->qdisc->ops->peek(q->qdisc); @@ -289,6 +288,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch) skb->tstamp.tv64 = 0; #endif pr_debug("netem_dequeue: return skb=%p\n", skb); + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } @@ -476,7 +476,6 @@ static int tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) __skb_queue_after(list, skb, nskb); sch->qstats.backlog += qdisc_pkt_len(nskb); - qdisc_bstats_update(sch, nskb); return NET_XMIT_SUCCESS; } diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 966158d49dd..2a318f2dc3e 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -22,8 +22,7 @@ #include <net/pkt_sched.h> -struct prio_sched_data -{ +struct prio_sched_data { int bands; struct tcf_proto *filter_list; u8 prio2band[TC_PRIO_MAX+1]; @@ -54,7 +53,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) if (!q->filter_list || err < 0) { if (TC_H_MAJ(band)) band = 0; - return q->queues[q->prio2band[band&TC_PRIO_MAX]]; + return q->queues[q->prio2band[band & TC_PRIO_MAX]]; } band = res.classid; } @@ -84,7 +83,6 @@ prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) ret = qdisc_enqueue(skb, qdisc); if (ret == NET_XMIT_SUCCESS) { - qdisc_bstats_update(sch, skb); sch->q.qlen++; return NET_XMIT_SUCCESS; } @@ -107,7 +105,7 @@ static struct sk_buff *prio_peek(struct Qdisc *sch) return NULL; } -static struct sk_buff *prio_dequeue(struct Qdisc* sch) +static struct sk_buff *prio_dequeue(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; @@ -116,6 +114,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch) struct Qdisc *qdisc = q->queues[prio]; struct sk_buff *skb = qdisc->dequeue(qdisc); if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; return skb; } @@ -124,7 +123,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch) } -static unsigned int prio_drop(struct Qdisc* sch) +static unsigned int prio_drop(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; @@ -143,24 +142,24 @@ static unsigned int prio_drop(struct Qdisc* sch) static void -prio_reset(struct Qdisc* sch) +prio_reset(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); - for (prio=0; prio<q->bands; prio++) + for (prio = 0; prio < q->bands; prio++) qdisc_reset(q->queues[prio]); sch->q.qlen = 0; } static void -prio_destroy(struct Qdisc* sch) +prio_destroy(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); tcf_destroy_chain(&q->filter_list); - for (prio=0; prio<q->bands; prio++) + for (prio = 0; prio < q->bands; prio++) qdisc_destroy(q->queues[prio]); } @@ -177,7 +176,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2) return -EINVAL; - for (i=0; i<=TC_PRIO_MAX; i++) { + for (i = 0; i <= TC_PRIO_MAX; i++) { if (qopt->priomap[i] >= qopt->bands) return -EINVAL; } @@ -186,7 +185,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); - for (i=q->bands; i<TCQ_PRIO_BANDS; i++) { + for (i = q->bands; i < TCQ_PRIO_BANDS; i++) { struct Qdisc *child = q->queues[i]; q->queues[i] = &noop_qdisc; if (child != &noop_qdisc) { @@ -196,9 +195,10 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) } sch_tree_unlock(sch); - for (i=0; i<q->bands; i++) { + for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; + child = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1)); @@ -224,7 +224,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt) struct prio_sched_data *q = qdisc_priv(sch); int i; - for (i=0; i<TCQ_PRIO_BANDS; i++) + for (i = 0; i < TCQ_PRIO_BANDS; i++) q->queues[i] = &noop_qdisc; if (opt == NULL) { @@ -232,7 +232,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt) } else { int err; - if ((err= prio_tune(sch, opt)) != 0) + if ((err = prio_tune(sch, opt)) != 0) return err; } return 0; @@ -245,7 +245,7 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_prio_qopt opt; opt.bands = q->bands; - memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); @@ -342,7 +342,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) arg->count++; continue; } - if (arg->fn(sch, prio+1, arg) < 0) { + if (arg->fn(sch, prio + 1, arg) < 0) { arg->stop = 1; break; } @@ -350,7 +350,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) } } -static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) +static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl) { struct prio_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index a6009c5a2c9..6649463da1b 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -36,8 +36,7 @@ if RED works correctly. */ -struct red_sched_data -{ +struct red_sched_data { u32 limit; /* HARD maximal queue length */ unsigned char flags; struct red_parms parms; @@ -55,7 +54,7 @@ static inline int red_use_harddrop(struct red_sched_data *q) return q->flags & TC_RED_HARDDROP; } -static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; @@ -67,34 +66,33 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch) red_end_of_idle_period(&q->parms); switch (red_action(&q->parms, q->parms.qavg)) { - case RED_DONT_MARK: - break; - - case RED_PROB_MARK: - sch->qstats.overlimits++; - if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { - q->stats.prob_drop++; - goto congestion_drop; - } - - q->stats.prob_mark++; - break; - - case RED_HARD_MARK: - sch->qstats.overlimits++; - if (red_use_harddrop(q) || !red_use_ecn(q) || - !INET_ECN_set_ce(skb)) { - q->stats.forced_drop++; - goto congestion_drop; - } - - q->stats.forced_mark++; - break; + case RED_DONT_MARK: + break; + + case RED_PROB_MARK: + sch->qstats.overlimits++; + if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { + q->stats.prob_drop++; + goto congestion_drop; + } + + q->stats.prob_mark++; + break; + + case RED_HARD_MARK: + sch->qstats.overlimits++; + if (red_use_harddrop(q) || !red_use_ecn(q) || + !INET_ECN_set_ce(skb)) { + q->stats.forced_drop++; + goto congestion_drop; + } + + q->stats.forced_mark++; + break; } ret = qdisc_enqueue(skb, child); if (likely(ret == NET_XMIT_SUCCESS)) { - qdisc_bstats_update(sch, skb); sch->q.qlen++; } else if (net_xmit_drop_count(ret)) { q->stats.pdrop++; @@ -107,22 +105,24 @@ congestion_drop: return NET_XMIT_CN; } -static struct sk_buff * red_dequeue(struct Qdisc* sch) +static struct sk_buff *red_dequeue(struct Qdisc *sch) { struct sk_buff *skb; struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; skb = child->dequeue(child); - if (skb) + if (skb) { + qdisc_bstats_update(sch, skb); sch->q.qlen--; - else if (!red_is_idling(&q->parms)) - red_start_of_idle_period(&q->parms); - + } else { + if (!red_is_idling(&q->parms)) + red_start_of_idle_period(&q->parms); + } return skb; } -static struct sk_buff * red_peek(struct Qdisc* sch) +static struct sk_buff *red_peek(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; @@ -130,7 +130,7 @@ static struct sk_buff * red_peek(struct Qdisc* sch) return child->ops->peek(child); } -static unsigned int red_drop(struct Qdisc* sch) +static unsigned int red_drop(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); struct Qdisc *child = q->qdisc; @@ -149,7 +149,7 @@ static unsigned int red_drop(struct Qdisc* sch) return 0; } -static void red_reset(struct Qdisc* sch) +static void red_reset(struct Qdisc *sch) { struct red_sched_data *q = qdisc_priv(sch); @@ -216,7 +216,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static int red_init(struct Qdisc* sch, struct nlattr *opt) +static int red_init(struct Qdisc *sch, struct nlattr *opt) { struct red_sched_data *q = qdisc_priv(sch); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 239ec53a634..4cff4423577 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -21,6 +21,7 @@ #include <linux/skbuff.h> #include <linux/jhash.h> #include <linux/slab.h> +#include <linux/vmalloc.h> #include <net/ip.h> #include <net/netlink.h> #include <net/pkt_sched.h> @@ -76,7 +77,8 @@ #define SFQ_DEPTH 128 /* max number of packets per flow */ #define SFQ_SLOTS 128 /* max number of flows */ #define SFQ_EMPTY_SLOT 255 -#define SFQ_HASH_DIVISOR 1024 +#define SFQ_DEFAULT_HASH_DIVISOR 1024 + /* We use 16 bits to store allot, and want to handle packets up to 64K * Scale allot by 8 (1<<3) so that no overflow occurs. */ @@ -92,8 +94,7 @@ typedef unsigned char sfq_index; * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1] * are 'pointers' to dep[] array */ -struct sfq_head -{ +struct sfq_head { sfq_index next; sfq_index prev; }; @@ -108,13 +109,12 @@ struct sfq_slot { short allot; /* credit for this slot */ }; -struct sfq_sched_data -{ +struct sfq_sched_data { /* Parameters */ int perturb_period; - unsigned quantum; /* Allotment per round: MUST BE >= MTU */ + unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ int limit; - + unsigned int divisor; /* number of slots in hash table */ /* Variables */ struct tcf_proto *filter_list; struct timer_list perturb_timer; @@ -122,7 +122,7 @@ struct sfq_sched_data sfq_index cur_depth; /* depth of longest slot */ unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ struct sfq_slot *tail; /* current slot in round */ - sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ + sfq_index *ht; /* Hash table (divisor slots) */ struct sfq_slot slots[SFQ_SLOTS]; struct sfq_head dep[SFQ_DEPTH]; /* Linked list of slots, indexed by depth */ }; @@ -137,12 +137,12 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index return &q->dep[val - SFQ_SLOTS]; } -static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) { - return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1); + return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1); } -static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) +static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) { u32 h, h2; @@ -157,13 +157,13 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) iph = ip_hdr(skb); h = (__force u32)iph->daddr; h2 = (__force u32)iph->saddr ^ iph->protocol; - if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + if (iph->frag_off & htons(IP_MF | IP_OFFSET)) break; poff = proto_ports_offset(iph->protocol); if (poff >= 0 && pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { iph = ip_hdr(skb); - h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff); + h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff); } break; } @@ -181,7 +181,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) if (poff >= 0 && pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) { iph = ipv6_hdr(skb); - h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff); + h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff); } break; } @@ -203,7 +203,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && - TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR) + TC_H_MIN(skb->priority) <= q->divisor) return TC_H_MIN(skb->priority); if (!q->filter_list) @@ -221,7 +221,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return 0; } #endif - if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR) + if (TC_H_MIN(res.classid) <= q->divisor) return TC_H_MIN(res.classid); } return 0; @@ -402,10 +402,8 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->tail = slot; slot->allot = q->scaled_quantum; } - if (++sch->q.qlen <= q->limit) { - qdisc_bstats_update(sch, skb); + if (++sch->q.qlen <= q->limit) return NET_XMIT_SUCCESS; - } sfq_drop(sch); return NET_XMIT_CN; @@ -445,6 +443,7 @@ next_slot: } skb = slot_dequeue_head(slot); sfq_dec(q, a); + qdisc_bstats_update(sch, skb); sch->q.qlen--; sch->qstats.backlog -= qdisc_pkt_len(skb); @@ -498,7 +497,11 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) q->perturb_period = ctl->perturb_period * HZ; if (ctl->limit) q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1); - + if (ctl->divisor) { + if (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536) + return -EINVAL; + q->divisor = ctl->divisor; + } qlen = sch->q.qlen; while (sch->q.qlen > q->limit) sfq_drop(sch); @@ -516,15 +519,13 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) static int sfq_init(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); + size_t sz; int i; q->perturb_timer.function = sfq_perturbation; q->perturb_timer.data = (unsigned long)sch; init_timer_deferrable(&q->perturb_timer); - for (i = 0; i < SFQ_HASH_DIVISOR; i++) - q->ht[i] = SFQ_EMPTY_SLOT; - for (i = 0; i < SFQ_DEPTH; i++) { q->dep[i].next = i + SFQ_SLOTS; q->dep[i].prev = i + SFQ_SLOTS; @@ -533,6 +534,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) q->limit = SFQ_DEPTH - 1; q->cur_depth = 0; q->tail = NULL; + q->divisor = SFQ_DEFAULT_HASH_DIVISOR; if (opt == NULL) { q->quantum = psched_mtu(qdisc_dev(sch)); q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); @@ -544,10 +546,23 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) return err; } + sz = sizeof(q->ht[0]) * q->divisor; + q->ht = kmalloc(sz, GFP_KERNEL); + if (!q->ht && sz > PAGE_SIZE) + q->ht = vmalloc(sz); + if (!q->ht) + return -ENOMEM; + for (i = 0; i < q->divisor; i++) + q->ht[i] = SFQ_EMPTY_SLOT; + for (i = 0; i < SFQ_SLOTS; i++) { slot_queue_init(&q->slots[i]); sfq_link(q, i); } + if (q->limit >= 1) + sch->flags |= TCQ_F_CAN_BYPASS; + else + sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } @@ -558,6 +573,10 @@ static void sfq_destroy(struct Qdisc *sch) tcf_destroy_chain(&q->filter_list); q->perturb_period = 0; del_timer_sync(&q->perturb_timer); + if (is_vmalloc_addr(q->ht)) + vfree(q->ht); + else + kfree(q->ht); } static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -570,7 +589,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) opt.perturb_period = q->perturb_period / HZ; opt.limit = q->limit; - opt.divisor = SFQ_HASH_DIVISOR; + opt.divisor = q->divisor; opt.flows = q->limit; NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); @@ -595,6 +614,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid) static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { + /* we cannot bypass queue discipline anymore */ + sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } @@ -648,7 +669,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) if (arg->stop) return; - for (i = 0; i < SFQ_HASH_DIVISOR; i++) { + for (i = 0; i < q->divisor; i++) { if (q->ht[i] == SFQ_EMPTY_SLOT || arg->count < arg->skip) { arg->count++; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 77565e72181..1dcfb5223a8 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -97,8 +97,7 @@ changed the limit is not effective anymore. */ -struct tbf_sched_data -{ +struct tbf_sched_data { /* Parameters */ u32 limit; /* Maximal length of backlog: bytes */ u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ @@ -115,10 +114,10 @@ struct tbf_sched_data struct qdisc_watchdog watchdog; /* Watchdog timer */ }; -#define L2T(q,L) qdisc_l2t((q)->R_tab,L) -#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L) +#define L2T(q, L) qdisc_l2t((q)->R_tab, L) +#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L) -static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); int ret; @@ -134,11 +133,10 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) } sch->q.qlen++; - qdisc_bstats_update(sch, skb); return NET_XMIT_SUCCESS; } -static unsigned int tbf_drop(struct Qdisc* sch) +static unsigned int tbf_drop(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); unsigned int len = 0; @@ -150,7 +148,7 @@ static unsigned int tbf_drop(struct Qdisc* sch) return len; } -static struct sk_buff *tbf_dequeue(struct Qdisc* sch) +static struct sk_buff *tbf_dequeue(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; @@ -186,7 +184,8 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch) q->tokens = toks; q->ptokens = ptoks; sch->q.qlen--; - sch->flags &= ~TCQ_F_THROTTLED; + qdisc_unthrottled(sch); + qdisc_bstats_update(sch, skb); return skb; } @@ -209,7 +208,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch) return NULL; } -static void tbf_reset(struct Qdisc* sch) +static void tbf_reset(struct Qdisc *sch) { struct tbf_sched_data *q = qdisc_priv(sch); @@ -227,7 +226,7 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = { [TCA_TBF_PTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, }; -static int tbf_change(struct Qdisc* sch, struct nlattr *opt) +static int tbf_change(struct Qdisc *sch, struct nlattr *opt) { int err; struct tbf_sched_data *q = qdisc_priv(sch); @@ -236,7 +235,7 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt) struct qdisc_rate_table *rtab = NULL; struct qdisc_rate_table *ptab = NULL; struct Qdisc *child = NULL; - int max_size,n; + int max_size, n; err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy); if (err < 0) @@ -259,15 +258,18 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt) } for (n = 0; n < 256; n++) - if (rtab->data[n] > qopt->buffer) break; - max_size = (n << qopt->rate.cell_log)-1; + if (rtab->data[n] > qopt->buffer) + break; + max_size = (n << qopt->rate.cell_log) - 1; if (ptab) { int size; for (n = 0; n < 256; n++) - if (ptab->data[n] > qopt->mtu) break; - size = (n << qopt->peakrate.cell_log)-1; - if (size < max_size) max_size = size; + if (ptab->data[n] > qopt->mtu) + break; + size = (n << qopt->peakrate.cell_log) - 1; + if (size < max_size) + max_size = size; } if (max_size < 0) goto done; @@ -310,7 +312,7 @@ done: return err; } -static int tbf_init(struct Qdisc* sch, struct nlattr *opt) +static int tbf_init(struct Qdisc *sch, struct nlattr *opt) { struct tbf_sched_data *q = qdisc_priv(sch); @@ -422,8 +424,7 @@ static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) } } -static const struct Qdisc_class_ops tbf_class_ops = -{ +static const struct Qdisc_class_ops tbf_class_ops = { .graft = tbf_graft, .leaf = tbf_leaf, .get = tbf_get, diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 84ce48eadff..45cd30098e3 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -53,8 +53,7 @@ which will not break load balancing, though native slave traffic will have the highest priority. */ -struct teql_master -{ +struct teql_master { struct Qdisc_ops qops; struct net_device *dev; struct Qdisc *slaves; @@ -65,29 +64,27 @@ struct teql_master unsigned long tx_dropped; }; -struct teql_sched_data -{ +struct teql_sched_data { struct Qdisc *next; struct teql_master *m; struct neighbour *ncache; struct sk_buff_head q; }; -#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next) +#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next) -#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT) +#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT) /* "teql*" qdisc routines */ static int -teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) +teql_enqueue(struct sk_buff *skb, struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct teql_sched_data *q = qdisc_priv(sch); if (q->q.qlen < dev->tx_queue_len) { __skb_queue_tail(&q->q, skb); - qdisc_bstats_update(sch, skb); return NET_XMIT_SUCCESS; } @@ -97,7 +94,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) } static struct sk_buff * -teql_dequeue(struct Qdisc* sch) +teql_dequeue(struct Qdisc *sch) { struct teql_sched_data *dat = qdisc_priv(sch); struct netdev_queue *dat_queue; @@ -111,19 +108,21 @@ teql_dequeue(struct Qdisc* sch) dat->m->slaves = sch; netif_wake_queue(m); } + } else { + qdisc_bstats_update(sch, skb); } sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen; return skb; } static struct sk_buff * -teql_peek(struct Qdisc* sch) +teql_peek(struct Qdisc *sch) { /* teql is meant to be used as root qdisc */ return NULL; } -static __inline__ void +static inline void teql_neigh_release(struct neighbour *n) { if (n) @@ -131,7 +130,7 @@ teql_neigh_release(struct neighbour *n) } static void -teql_reset(struct Qdisc* sch) +teql_reset(struct Qdisc *sch) { struct teql_sched_data *dat = qdisc_priv(sch); @@ -141,13 +140,14 @@ teql_reset(struct Qdisc* sch) } static void -teql_destroy(struct Qdisc* sch) +teql_destroy(struct Qdisc *sch) { struct Qdisc *q, *prev; struct teql_sched_data *dat = qdisc_priv(sch); struct teql_master *master = dat->m; - if ((prev = master->slaves) != NULL) { + prev = master->slaves; + if (prev) { do { q = NEXT_SLAVE(prev); if (q == sch) { @@ -179,7 +179,7 @@ teql_destroy(struct Qdisc* sch) static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt) { struct net_device *dev = qdisc_dev(sch); - struct teql_master *m = (struct teql_master*)sch->ops; + struct teql_master *m = (struct teql_master *)sch->ops; struct teql_sched_data *q = qdisc_priv(sch); if (dev->hard_header_len > m->dev->hard_header_len) @@ -290,7 +290,8 @@ restart: nores = 0; busy = 0; - if ((q = start) == NULL) + q = start; + if (!q) goto drop; do { @@ -355,10 +356,10 @@ drop: static int teql_master_open(struct net_device *dev) { - struct Qdisc * q; + struct Qdisc *q; struct teql_master *m = netdev_priv(dev); int mtu = 0xFFFE; - unsigned flags = IFF_NOARP|IFF_MULTICAST; + unsigned int flags = IFF_NOARP | IFF_MULTICAST; if (m->slaves == NULL) return -EUNATCH; @@ -426,7 +427,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu) do { if (new_mtu > qdisc_dev(q)->mtu) return -EINVAL; - } while ((q=NEXT_SLAVE(q)) != m->slaves); + } while ((q = NEXT_SLAVE(q)) != m->slaves); } dev->mtu = new_mtu; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dd419d28620..d8d98d5b508 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1475,6 +1475,12 @@ restart: goto out_free; } + if (sk_filter(other, skb) < 0) { + /* Toss the packet but do not return any error to the sender */ + err = len; + goto out_free; + } + unix_state_lock(other); err = -EPERM; if (!unix_may_send(sk, other)) @@ -1978,36 +1984,38 @@ static int unix_shutdown(struct socket *sock, int mode) mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN); - if (mode) { - unix_state_lock(sk); - sk->sk_shutdown |= mode; - other = unix_peer(sk); - if (other) - sock_hold(other); - unix_state_unlock(sk); - sk->sk_state_change(sk); - - if (other && - (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { - - int peer_mode = 0; - - if (mode&RCV_SHUTDOWN) - peer_mode |= SEND_SHUTDOWN; - if (mode&SEND_SHUTDOWN) - peer_mode |= RCV_SHUTDOWN; - unix_state_lock(other); - other->sk_shutdown |= peer_mode; - unix_state_unlock(other); - other->sk_state_change(other); - if (peer_mode == SHUTDOWN_MASK) - sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); - else if (peer_mode & RCV_SHUTDOWN) - sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); - } - if (other) - sock_put(other); + if (!mode) + return 0; + + unix_state_lock(sk); + sk->sk_shutdown |= mode; + other = unix_peer(sk); + if (other) + sock_hold(other); + unix_state_unlock(sk); + sk->sk_state_change(sk); + + if (other && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { + + int peer_mode = 0; + + if (mode&RCV_SHUTDOWN) + peer_mode |= SEND_SHUTDOWN; + if (mode&SEND_SHUTDOWN) + peer_mode |= RCV_SHUTDOWN; + unix_state_lock(other); + other->sk_shutdown |= peer_mode; + unix_state_unlock(other); + other->sk_state_change(other); + if (peer_mode == SHUTDOWN_MASK) + sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); + else if (peer_mode & RCV_SHUTDOWN) + sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); } + if (other) + sock_put(other); + return 0; } diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 74944a2dd43..788a12c1eb5 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -59,8 +59,6 @@ #include <asm/uaccess.h> /* copy_to/from_user */ #include <linux/init.h> /* __initfunc et al. */ -#define KMEM_SAFETYZONE 8 - #define DEV_TO_SLAVE(dev) (*((struct net_device **)netdev_priv(dev))) /* diff --git a/security/keys/Makefile b/security/keys/Makefile index 6c941050f57..1bf090a885f 100644 --- a/security/keys/Makefile +++ b/security/keys/Makefile @@ -13,8 +13,8 @@ obj-y := \ request_key_auth.o \ user_defined.o -obj-$(CONFIG_TRUSTED_KEYS) += trusted_defined.o -obj-$(CONFIG_ENCRYPTED_KEYS) += encrypted_defined.o +obj-$(CONFIG_TRUSTED_KEYS) += trusted.o +obj-$(CONFIG_ENCRYPTED_KEYS) += encrypted.o obj-$(CONFIG_KEYS_COMPAT) += compat.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_SYSCTL) += sysctl.o diff --git a/security/keys/encrypted_defined.c b/security/keys/encrypted.c index 28791a65740..9e7e4ce3fae 100644 --- a/security/keys/encrypted_defined.c +++ b/security/keys/encrypted.c @@ -30,7 +30,7 @@ #include <crypto/sha.h> #include <crypto/aes.h> -#include "encrypted_defined.h" +#include "encrypted.h" static const char KEY_TRUSTED_PREFIX[] = "trusted:"; static const char KEY_USER_PREFIX[] = "user:"; @@ -888,6 +888,7 @@ static int __init init_encrypted(void) out: encrypted_shash_release(); return ret; + } static void __exit cleanup_encrypted(void) diff --git a/security/keys/encrypted_defined.h b/security/keys/encrypted.h index cef5e2f2b7d..cef5e2f2b7d 100644 --- a/security/keys/encrypted_defined.h +++ b/security/keys/encrypted.h diff --git a/security/keys/trusted_defined.c b/security/keys/trusted.c index 2836c6dc18a..83fc92e297c 100644 --- a/security/keys/trusted_defined.c +++ b/security/keys/trusted.c @@ -29,7 +29,7 @@ #include <linux/tpm.h> #include <linux/tpm_command.h> -#include "trusted_defined.h" +#include "trusted.h" static const char hmac_alg[] = "hmac(sha1)"; static const char hash_alg[] = "sha1"; @@ -1032,6 +1032,7 @@ static int trusted_update(struct key *key, const void *data, size_t datalen) ret = datablob_parse(datablob, new_p, new_o); if (ret != Opt_update) { ret = -EINVAL; + kfree(new_p); goto out; } /* copy old key values, and reseal with new pcrs */ diff --git a/security/keys/trusted_defined.h b/security/keys/trusted.h index 3249fbd2b65..3249fbd2b65 100644 --- a/security/keys/trusted_defined.h +++ b/security/keys/trusted.h diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c index c3f845cbcd4..a53373207fb 100644 --- a/security/selinux/ss/conditional.c +++ b/security/selinux/ss/conditional.c @@ -178,7 +178,7 @@ int cond_init_bool_indexes(struct policydb *p) p->bool_val_to_struct = (struct cond_bool_datum **) kmalloc(p->p_bools.nprim * sizeof(struct cond_bool_datum *), GFP_KERNEL); if (!p->bool_val_to_struct) - return -1; + return -ENOMEM; return 0; } diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c index be9de387283..57363562f0f 100644 --- a/security/selinux/ss/policydb.c +++ b/security/selinux/ss/policydb.c @@ -501,8 +501,8 @@ static int policydb_index(struct policydb *p) if (rc) goto out; - rc = -ENOMEM; - if (cond_init_bool_indexes(p)) + rc = cond_init_bool_indexes(p); + if (rc) goto out; for (i = 0; i < SYM_NUM; i++) { diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 2b5387d53ba..7141c42e146 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -204,13 +204,11 @@ EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wshadow EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Winit-self EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wpacked EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wredundant-decls -EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wstack-protector EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wstrict-aliasing=3 EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wswitch-default EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wswitch-enum EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wno-system-headers EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wundef -EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wvolatile-register-var EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wwrite-strings EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wbad-function-cast EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wmissing-declarations @@ -294,6 +292,13 @@ ifeq ($(call try-cc,$(SOURCE_HELLO),-Werror -fstack-protector-all),y) CFLAGS := $(CFLAGS) -fstack-protector-all endif +ifeq ($(call try-cc,$(SOURCE_HELLO),-Werror -Wstack-protector),y) + CFLAGS := $(CFLAGS) -Wstack-protector +endif + +ifeq ($(call try-cc,$(SOURCE_HELLO),-Werror -Wvolatile-register-var),y) + CFLAGS := $(CFLAGS) -Wvolatile-register-var +endif ### --- END CONFIGURATION SECTION --- diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index c056cdc0691..8879463807e 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -212,7 +212,7 @@ get_source_line(struct hist_entry *he, int len, const char *filename) continue; offset = start + i; - sprintf(cmd, "addr2line -e %s %016llx", filename, offset); + sprintf(cmd, "addr2line -e %s %016" PRIx64, filename, offset); fp = popen(cmd, "r"); if (!fp) continue; @@ -270,9 +270,9 @@ static void hist_entry__print_hits(struct hist_entry *self) for (offset = 0; offset < len; ++offset) if (h->ip[offset] != 0) - printf("%*Lx: %Lu\n", BITS_PER_LONG / 2, + printf("%*" PRIx64 ": %" PRIu64 "\n", BITS_PER_LONG / 2, sym->start + offset, h->ip[offset]); - printf("%*s: %Lu\n", BITS_PER_LONG / 2, "h->sum", h->sum); + printf("%*s: %" PRIu64 "\n", BITS_PER_LONG / 2, "h->sum", h->sum); } static int hist_entry__tty_annotate(struct hist_entry *he) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index def7ddc2fd4..d97256d6598 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -371,10 +371,10 @@ static void __print_result(struct rb_root *root, struct perf_session *session, addr = data->ptr; if (sym != NULL) - snprintf(buf, sizeof(buf), "%s+%Lx", sym->name, + snprintf(buf, sizeof(buf), "%s+%" PRIx64 "", sym->name, addr - map->unmap_ip(map, sym->start)); else - snprintf(buf, sizeof(buf), "%#Lx", addr); + snprintf(buf, sizeof(buf), "%#" PRIx64 "", addr); printf(" %-34s |", buf); printf(" %9llu/%-5lu | %9llu/%-5lu | %8lu | %8lu | %6.3f%%\n", diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index b9c6e543297..2b36defc5d7 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c @@ -782,9 +782,9 @@ static void print_result(void) pr_info("%10u ", st->nr_acquired); pr_info("%10u ", st->nr_contended); - pr_info("%15llu ", st->wait_time_total); - pr_info("%15llu ", st->wait_time_max); - pr_info("%15llu ", st->wait_time_min == ULLONG_MAX ? + pr_info("%15" PRIu64 " ", st->wait_time_total); + pr_info("%15" PRIu64 " ", st->wait_time_max); + pr_info("%15" PRIu64 " ", st->wait_time_min == ULLONG_MAX ? 0 : st->wait_time_min); pr_info("\n"); } diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index fcd29e8af29..b2f729fdb31 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -817,7 +817,7 @@ static int __cmd_record(int argc, const char **argv) * Approximate RIP event size: 24 bytes. */ fprintf(stderr, - "[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n", + "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n", (double)bytes_written / 1024.0 / 1024.0, output_name, bytes_written / 24); diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 75183a4518e..c27e31f289e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -197,7 +197,7 @@ static int process_read_event(event_t *event, struct sample_data *sample __used, event->read.value); } - dump_printf(": %d %d %s %Lu\n", event->read.pid, event->read.tid, + dump_printf(": %d %d %s %" PRIu64 "\n", event->read.pid, event->read.tid, attr ? __event_name(attr->type, attr->config) : "FAIL", event->read.value); diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 29e7ffd8569..29acb894e03 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -193,7 +193,7 @@ static void calibrate_run_measurement_overhead(void) } run_measurement_overhead = min_delta; - printf("run measurement overhead: %Ld nsecs\n", min_delta); + printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta); } static void calibrate_sleep_measurement_overhead(void) @@ -211,7 +211,7 @@ static void calibrate_sleep_measurement_overhead(void) min_delta -= 10000; sleep_measurement_overhead = min_delta; - printf("sleep measurement overhead: %Ld nsecs\n", min_delta); + printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta); } static struct sched_atom * @@ -617,13 +617,13 @@ static void test_calibrations(void) burn_nsecs(1e6); T1 = get_nsecs(); - printf("the run test took %Ld nsecs\n", T1-T0); + printf("the run test took %" PRIu64 " nsecs\n", T1 - T0); T0 = get_nsecs(); sleep_nsecs(1e6); T1 = get_nsecs(); - printf("the sleep test took %Ld nsecs\n", T1-T0); + printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0); } #define FILL_FIELD(ptr, field, event, data) \ @@ -816,10 +816,10 @@ replay_switch_event(struct trace_switch_event *switch_event, delta = 0; if (delta < 0) - die("hm, delta: %Ld < 0 ?\n", delta); + die("hm, delta: %" PRIu64 " < 0 ?\n", delta); if (verbose) { - printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n", + printf(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n", switch_event->prev_comm, switch_event->prev_pid, switch_event->next_comm, switch_event->next_pid, delta); @@ -1048,7 +1048,7 @@ latency_switch_event(struct trace_switch_event *switch_event, delta = 0; if (delta < 0) - die("hm, delta: %Ld < 0 ?\n", delta); + die("hm, delta: %" PRIu64 " < 0 ?\n", delta); sched_out = perf_session__findnew(session, switch_event->prev_pid); @@ -1221,7 +1221,7 @@ static void output_lat_thread(struct work_atoms *work_list) avg = work_list->total_lat / work_list->nb_atoms; - printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n", + printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n", (double)work_list->total_runtime / 1e6, work_list->nb_atoms, (double)avg / 1e6, (double)work_list->max_lat / 1e6, @@ -1423,7 +1423,7 @@ map_switch_event(struct trace_switch_event *switch_event, delta = 0; if (delta < 0) - die("hm, delta: %Ld < 0 ?\n", delta); + die("hm, delta: %" PRIu64 " < 0 ?\n", delta); sched_out = perf_session__findnew(session, switch_event->prev_pid); @@ -1713,7 +1713,7 @@ static void __cmd_lat(void) } printf(" -----------------------------------------------------------------------------------------\n"); - printf(" TOTAL: |%11.3f ms |%9Ld |\n", + printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n", (double)all_runtime/1e6, all_count); printf(" ---------------------------------------------------\n"); diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 150a606002e..b766c2a9ac9 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -77,8 +77,8 @@ static int process_sample_event(event_t *event, struct sample_data *sample, if (session->sample_type & PERF_SAMPLE_RAW) { if (debug_mode) { if (sample->time < last_timestamp) { - pr_err("Samples misordered, previous: %llu " - "this: %llu\n", last_timestamp, + pr_err("Samples misordered, previous: %" PRIu64 + " this: %" PRIu64 "\n", last_timestamp, sample->time); nr_unordered++; } @@ -126,7 +126,7 @@ static int __cmd_script(struct perf_session *session) ret = perf_session__process_events(session, &event_ops); if (debug_mode) - pr_err("Misordered timestamps: %llu\n", nr_unordered); + pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered); return ret; } diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0ff11d9b13b..a482a191a0c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -206,8 +206,8 @@ static int read_counter_aggr(struct perf_evsel *counter) update_stats(&ps->res_stats[i], count[i]); if (verbose) { - fprintf(stderr, "%s: %Ld %Ld %Ld\n", event_name(counter), - count[0], count[1], count[2]); + fprintf(stderr, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", + event_name(counter), count[0], count[1], count[2]); } /* diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index ed5696198d3..5dcdba653d7 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -146,7 +146,7 @@ next_pair: if (llabs(skew) < page_size) continue; - pr_debug("%#Lx: diff end addr for %s v: %#Lx k: %#Lx\n", + pr_debug("%#" PRIx64 ": diff end addr for %s v: %#" PRIx64 " k: %#" PRIx64 "\n", sym->start, sym->name, sym->end, pair->end); } else { struct rb_node *nnd; @@ -168,11 +168,11 @@ detour: goto detour; } - pr_debug("%#Lx: diff name v: %s k: %s\n", + pr_debug("%#" PRIx64 ": diff name v: %s k: %s\n", sym->start, sym->name, pair->name); } } else - pr_debug("%#Lx: %s not on kallsyms\n", sym->start, sym->name); + pr_debug("%#" PRIx64 ": %s not on kallsyms\n", sym->start, sym->name); err = -1; } @@ -211,10 +211,10 @@ detour: if (pair->start == pos->start) { pair->priv = 1; - pr_info(" %Lx-%Lx %Lx %s in kallsyms as", + pr_info(" %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s in kallsyms as", pos->start, pos->end, pos->pgoff, pos->dso->name); if (pos->pgoff != pair->pgoff || pos->end != pair->end) - pr_info(": \n*%Lx-%Lx %Lx", + pr_info(": \n*%" PRIx64 "-%" PRIx64 " %" PRIx64 "", pair->start, pair->end, pair->pgoff); pr_info(" %s\n", pair->dso->name); pair->priv = 1; @@ -307,7 +307,7 @@ static int test__open_syscall_event(void) } if (evsel->counts->cpu[0].val != nr_open_calls) { - pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls, got %Ld\n", + pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls, got %" PRIu64 "\n", nr_open_calls, evsel->counts->cpu[0].val); goto out_close_fd; } @@ -332,8 +332,7 @@ static int test__open_syscall_event_on_all_cpus(void) struct perf_evsel *evsel; struct perf_event_attr attr; unsigned int nr_open_calls = 111, i; - cpu_set_t *cpu_set; - size_t cpu_set_size; + cpu_set_t cpu_set; int id = trace_event__id("sys_enter_open"); if (id < 0) { @@ -353,13 +352,8 @@ static int test__open_syscall_event_on_all_cpus(void) return -1; } - cpu_set = CPU_ALLOC(cpus->nr); - if (cpu_set == NULL) - goto out_thread_map_delete; - - cpu_set_size = CPU_ALLOC_SIZE(cpus->nr); - CPU_ZERO_S(cpu_set_size, cpu_set); + CPU_ZERO(&cpu_set); memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_TRACEPOINT; @@ -367,7 +361,7 @@ static int test__open_syscall_event_on_all_cpus(void) evsel = perf_evsel__new(&attr, 0); if (evsel == NULL) { pr_debug("perf_evsel__new\n"); - goto out_cpu_free; + goto out_thread_map_delete; } if (perf_evsel__open(evsel, cpus, threads) < 0) { @@ -379,14 +373,29 @@ static int test__open_syscall_event_on_all_cpus(void) for (cpu = 0; cpu < cpus->nr; ++cpu) { unsigned int ncalls = nr_open_calls + cpu; + /* + * XXX eventually lift this restriction in a way that + * keeps perf building on older glibc installations + * without CPU_ALLOC. 1024 cpus in 2010 still seems + * a reasonable upper limit tho :-) + */ + if (cpus->map[cpu] >= CPU_SETSIZE) { + pr_debug("Ignoring CPU %d\n", cpus->map[cpu]); + continue; + } - CPU_SET(cpu, cpu_set); - sched_setaffinity(0, cpu_set_size, cpu_set); + CPU_SET(cpus->map[cpu], &cpu_set); + if (sched_setaffinity(0, sizeof(cpu_set), &cpu_set) < 0) { + pr_debug("sched_setaffinity() failed on CPU %d: %s ", + cpus->map[cpu], + strerror(errno)); + goto out_close_fd; + } for (i = 0; i < ncalls; ++i) { fd = open("/etc/passwd", O_RDONLY); close(fd); } - CPU_CLR(cpu, cpu_set); + CPU_CLR(cpus->map[cpu], &cpu_set); } /* @@ -402,6 +411,9 @@ static int test__open_syscall_event_on_all_cpus(void) for (cpu = 0; cpu < cpus->nr; ++cpu) { unsigned int expected; + if (cpus->map[cpu] >= CPU_SETSIZE) + continue; + if (perf_evsel__read_on_cpu(evsel, cpu, 0) < 0) { pr_debug("perf_evsel__open_read_on_cpu\n"); goto out_close_fd; @@ -409,8 +421,8 @@ static int test__open_syscall_event_on_all_cpus(void) expected = nr_open_calls + cpu; if (evsel->counts->cpu[cpu].val != expected) { - pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %Ld\n", - expected, cpu, evsel->counts->cpu[cpu].val); + pr_debug("perf_evsel__read_on_cpu: expected to intercept %d calls on cpu %d, got %" PRIu64 "\n", + expected, cpus->map[cpu], evsel->counts->cpu[cpu].val); goto out_close_fd; } } @@ -420,8 +432,6 @@ out_close_fd: perf_evsel__close_fd(evsel, 1, threads->nr); out_evsel_delete: perf_evsel__delete(evsel); -out_cpu_free: - CPU_FREE(cpu_set); out_thread_map_delete: thread_map__delete(threads); return err; diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 05344c6210a..b6998e05576 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -40,6 +40,7 @@ #include <stdio.h> #include <termios.h> #include <unistd.h> +#include <inttypes.h> #include <errno.h> #include <time.h> @@ -214,7 +215,7 @@ static int parse_source(struct sym_entry *syme) len = sym->end - sym->start; sprintf(command, - "objdump --start-address=%#0*Lx --stop-address=%#0*Lx -dS %s", + "objdump --start-address=%#0*" PRIx64 " --stop-address=%#0*" PRIx64 " -dS %s", BITS_PER_LONG / 4, map__rip_2objdump(map, sym->start), BITS_PER_LONG / 4, map__rip_2objdump(map, sym->end), path); @@ -308,7 +309,7 @@ static void lookup_sym_source(struct sym_entry *syme) struct source_line *line; char pattern[PATTERN_LEN + 1]; - sprintf(pattern, "%0*Lx <", BITS_PER_LONG / 4, + sprintf(pattern, "%0*" PRIx64 " <", BITS_PER_LONG / 4, map__rip_2objdump(syme->map, symbol->start)); pthread_mutex_lock(&syme->src->lock); @@ -537,7 +538,7 @@ static void print_sym_table(void) if (nr_counters == 1 || !display_weighted) { struct perf_evsel *first; first = list_entry(evsel_list.next, struct perf_evsel, node); - printf("%Ld", first->attr.sample_period); + printf("%" PRIu64, (uint64_t)first->attr.sample_period); if (freq) printf("Hz "); else @@ -640,7 +641,7 @@ static void print_sym_table(void) percent_color_fprintf(stdout, "%4.1f%%", pcnt); if (verbose) - printf(" %016llx", sym->start); + printf(" %016" PRIx64, sym->start); printf(" %-*.*s", sym_width, sym_width, sym->name); printf(" %-*.*s\n", dso_width, dso_width, dso_width >= syme->map->dso->long_name_len ? diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 2302ec051bb..1478ab4ee22 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -459,7 +459,8 @@ int event__process_comm(event_t *self, struct sample_data *sample __used, int event__process_lost(event_t *self, struct sample_data *sample __used, struct perf_session *session) { - dump_printf(": id:%Ld: lost:%Ld\n", self->lost.id, self->lost.lost); + dump_printf(": id:%" PRIu64 ": lost:%" PRIu64 "\n", + self->lost.id, self->lost.lost); session->hists.stats.total_lost += self->lost.lost; return 0; } @@ -575,7 +576,7 @@ int event__process_mmap(event_t *self, struct sample_data *sample __used, u8 cpumode = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; int ret = 0; - dump_printf(" %d/%d: [%#Lx(%#Lx) @ %#Lx]: %s\n", + dump_printf(" %d/%d: [%#" PRIx64 "(%#" PRIx64 ") @ %#" PRIx64 "]: %s\n", self->mmap.pid, self->mmap.tid, self->mmap.start, self->mmap.len, self->mmap.pgoff, self->mmap.filename); diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 989fa2dee2f..f6a929e7498 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -798,8 +798,8 @@ static int perf_file_section__process(struct perf_file_section *self, int feat, int fd) { if (lseek(fd, self->offset, SEEK_SET) == (off_t)-1) { - pr_debug("Failed to lseek to %Ld offset for feature %d, " - "continuing...\n", self->offset, feat); + pr_debug("Failed to lseek to %" PRIu64 " offset for feature " + "%d, continuing...\n", self->offset, feat); return 0; } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index c749ba6136a..32f4f1f2f6e 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -636,13 +636,13 @@ int hist_entry__snprintf(struct hist_entry *self, char *s, size_t size, } } } else - ret = snprintf(s, size, sep ? "%lld" : "%12lld ", period); + ret = snprintf(s, size, sep ? "%" PRIu64 : "%12" PRIu64 " ", period); if (symbol_conf.show_nr_samples) { if (sep) - ret += snprintf(s + ret, size - ret, "%c%lld", *sep, period); + ret += snprintf(s + ret, size - ret, "%c%" PRIu64, *sep, period); else - ret += snprintf(s + ret, size - ret, "%11lld", period); + ret += snprintf(s + ret, size - ret, "%11" PRIu64, period); } if (pair_hists) { @@ -971,7 +971,7 @@ int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip) sym_size = sym->end - sym->start; offset = ip - sym->start; - pr_debug3("%s: ip=%#Lx\n", __func__, self->ms.map->unmap_ip(self->ms.map, ip)); + pr_debug3("%s: ip=%#" PRIx64 "\n", __func__, self->ms.map->unmap_ip(self->ms.map, ip)); if (offset >= sym_size) return 0; @@ -980,8 +980,9 @@ int hist_entry__inc_addr_samples(struct hist_entry *self, u64 ip) h->sum++; h->ip[offset]++; - pr_debug3("%#Lx %s: period++ [ip: %#Lx, %#Lx] => %Ld\n", self->ms.sym->start, - self->ms.sym->name, ip, ip - self->ms.sym->start, h->ip[offset]); + pr_debug3("%#" PRIx64 " %s: period++ [ip: %#" PRIx64 ", %#" PRIx64 + "] => %" PRIu64 "\n", self->ms.sym->start, self->ms.sym->name, + ip, ip - self->ms.sym->start, h->ip[offset]); return 0; } @@ -1132,7 +1133,7 @@ fallback: goto out_free_filename; } - pr_debug("%s: filename=%s, sym=%s, start=%#Lx, end=%#Lx\n", __func__, + pr_debug("%s: filename=%s, sym=%s, start=%#" PRIx64 ", end=%#" PRIx64 "\n", __func__, filename, sym->name, map->unmap_ip(map, sym->start), map->unmap_ip(map, sym->end)); @@ -1142,7 +1143,7 @@ fallback: dso, dso->long_name, sym, sym->name); snprintf(command, sizeof(command), - "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS -C %s|grep -v %s|expand", + "objdump --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 " -dS -C %s|grep -v %s|expand", map__rip_2objdump(map, sym->start), map__rip_2objdump(map, sym->end), symfs_filename, filename); diff --git a/tools/perf/util/include/linux/bitops.h b/tools/perf/util/include/linux/bitops.h index 8be0b968ca0..305c8484f20 100644 --- a/tools/perf/util/include/linux/bitops.h +++ b/tools/perf/util/include/linux/bitops.h @@ -2,6 +2,7 @@ #define _PERF_LINUX_BITOPS_H_ #include <linux/kernel.h> +#include <linux/compiler.h> #include <asm/hweight.h> #define BITS_PER_LONG __WORDSIZE diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index 3a7eb6ec0ee..a16ecab5229 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -1,5 +1,6 @@ #include "symbol.h" #include <errno.h> +#include <inttypes.h> #include <limits.h> #include <stdlib.h> #include <string.h> @@ -195,7 +196,7 @@ int map__overlap(struct map *l, struct map *r) size_t map__fprintf(struct map *self, FILE *fp) { - return fprintf(fp, " %Lx-%Lx %Lx %s\n", + return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %" PRIx64 " %s\n", self->start, self->end, self->pgoff, self->dso->name); } diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index bc2732ee23e..135f69baf96 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -279,7 +279,7 @@ const char *__event_name(int type, u64 config) static char buf[32]; if (type == PERF_TYPE_RAW) { - sprintf(buf, "raw 0x%llx", config); + sprintf(buf, "raw 0x%" PRIx64, config); return buf; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index b82cafb8377..458e3ecf17a 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -23,7 +23,7 @@ struct tracepoint_path { }; extern struct tracepoint_path *tracepoint_id_to_path(u64 config); -extern bool have_tracepoints(struct list_head *evsel_list); +extern bool have_tracepoints(struct list_head *evlist); extern int nr_counters; diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 128aaab0aed..6e29d9c9dcc 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -172,7 +172,7 @@ static int kprobe_convert_to_perf_probe(struct probe_trace_point *tp, sym = __find_kernel_function_by_name(tp->symbol, &map); if (sym) { addr = map->unmap_ip(map, sym->start + tp->offset); - pr_debug("try to find %s+%ld@%llx\n", tp->symbol, + pr_debug("try to find %s+%ld@%" PRIx64 "\n", tp->symbol, tp->offset, addr); ret = find_perf_probe_point((unsigned long)addr, pp); } diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 313dac2d94c..105f00bfd55 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -652,10 +652,11 @@ static void callchain__printf(struct sample_data *sample) { unsigned int i; - printf("... chain: nr:%Lu\n", sample->callchain->nr); + printf("... chain: nr:%" PRIu64 "\n", sample->callchain->nr); for (i = 0; i < sample->callchain->nr; i++) - printf("..... %2d: %016Lx\n", i, sample->callchain->ips[i]); + printf("..... %2d: %016" PRIx64 "\n", + i, sample->callchain->ips[i]); } static void perf_session__print_tstamp(struct perf_session *session, @@ -672,7 +673,7 @@ static void perf_session__print_tstamp(struct perf_session *session, printf("%u ", sample->cpu); if (session->sample_type & PERF_SAMPLE_TIME) - printf("%Lu ", sample->time); + printf("%" PRIu64 " ", sample->time); } static void dump_event(struct perf_session *session, event_t *event, @@ -681,16 +682,16 @@ static void dump_event(struct perf_session *session, event_t *event, if (!dump_trace) return; - printf("\n%#Lx [%#x]: event: %d\n", file_offset, event->header.size, - event->header.type); + printf("\n%#" PRIx64 " [%#x]: event: %d\n", + file_offset, event->header.size, event->header.type); trace_event(event); if (sample) perf_session__print_tstamp(session, event, sample); - printf("%#Lx [%#x]: PERF_RECORD_%s", file_offset, event->header.size, - event__get_event_name(event->header.type)); + printf("%#" PRIx64 " [%#x]: PERF_RECORD_%s", file_offset, + event->header.size, event__get_event_name(event->header.type)); } static void dump_sample(struct perf_session *session, event_t *event, @@ -699,8 +700,9 @@ static void dump_sample(struct perf_session *session, event_t *event, if (!dump_trace) return; - printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc, - sample->pid, sample->tid, sample->ip, sample->period); + printf("(IP, %d): %d/%d: %#" PRIx64 " period: %" PRIu64 "\n", + event->header.misc, sample->pid, sample->tid, sample->ip, + sample->period); if (session->sample_type & PERF_SAMPLE_CALLCHAIN) callchain__printf(sample); @@ -843,8 +845,8 @@ static void perf_session__warn_about_errors(const struct perf_session *session, { if (ops->lost == event__process_lost && session->hists.stats.total_lost != 0) { - ui__warning("Processed %Lu events and LOST %Lu!\n\n" - "Check IO/CPU overload!\n\n", + ui__warning("Processed %" PRIu64 " events and LOST %" PRIu64 + "!\n\nCheck IO/CPU overload!\n\n", session->hists.stats.total_period, session->hists.stats.total_lost); } @@ -918,7 +920,7 @@ more: if (size == 0 || (skip = perf_session__process_event(self, &event, ops, head)) < 0) { - dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n", + dump_printf("%#" PRIx64 " [%#x]: skipping unknown header type: %d\n", head, event.header.size, event.header.type); /* * assume we lost track of the stream, check alignment, and @@ -1023,7 +1025,7 @@ more: if (size == 0 || perf_session__process_event(session, event, ops, file_pos) < 0) { - dump_printf("%#Lx [%#x]: skipping unknown header type: %d\n", + dump_printf("%#" PRIx64 " [%#x]: skipping unknown header type: %d\n", file_offset + head, event->header.size, event->header.type); /* diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c index b3637db025a..fb737fe9be9 100644 --- a/tools/perf/util/svghelper.c +++ b/tools/perf/util/svghelper.c @@ -12,6 +12,7 @@ * of the License. */ +#include <inttypes.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> @@ -43,11 +44,11 @@ static double cpu2y(int cpu) return cpu2slot(cpu) * SLOT_MULT; } -static double time2pixels(u64 time) +static double time2pixels(u64 __time) { double X; - X = 1.0 * svg_page_width * (time - first_time) / (last_time - first_time); + X = 1.0 * svg_page_width * (__time - first_time) / (last_time - first_time); return X; } @@ -94,7 +95,7 @@ void open_svg(const char *filename, int cpus, int rows, u64 start, u64 end) total_height = (1 + rows + cpu2slot(cpus)) * SLOT_MULT; fprintf(svgfile, "<?xml version=\"1.0\" standalone=\"no\"?> \n"); - fprintf(svgfile, "<svg width=\"%i\" height=\"%llu\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\">\n", svg_page_width, total_height); + fprintf(svgfile, "<svg width=\"%i\" height=\"%" PRIu64 "\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\">\n", svg_page_width, total_height); fprintf(svgfile, "<defs>\n <style type=\"text/css\">\n <![CDATA[\n"); @@ -483,7 +484,7 @@ void svg_time_grid(void) color = 128; } - fprintf(svgfile, "<line x1=\"%4.8f\" y1=\"%4.2f\" x2=\"%4.8f\" y2=\"%llu\" style=\"stroke:rgb(%i,%i,%i);stroke-width:%1.3f\"/>\n", + fprintf(svgfile, "<line x1=\"%4.8f\" y1=\"%4.2f\" x2=\"%4.8f\" y2=\"%" PRIu64 "\" style=\"stroke:rgb(%i,%i,%i);stroke-width:%1.3f\"/>\n", time2pixels(i), SLOT_MULT/2, time2pixels(i), total_height, color, color, color, thickness); i += 10000000; diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 15ccfba8cdf..7821d0e6866 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -11,6 +11,7 @@ #include <sys/param.h> #include <fcntl.h> #include <unistd.h> +#include <inttypes.h> #include "build-id.h" #include "debug.h" #include "symbol.h" @@ -153,7 +154,7 @@ static struct symbol *symbol__new(u64 start, u64 len, u8 binding, self->binding = binding; self->namelen = namelen - 1; - pr_debug4("%s: %s %#Lx-%#Lx\n", __func__, name, start, self->end); + pr_debug4("%s: %s %#" PRIx64 "-%#" PRIx64 "\n", __func__, name, start, self->end); memcpy(self->name, name, namelen); @@ -167,7 +168,7 @@ void symbol__delete(struct symbol *self) static size_t symbol__fprintf(struct symbol *self, FILE *fp) { - return fprintf(fp, " %llx-%llx %c %s\n", + return fprintf(fp, " %" PRIx64 "-%" PRIx64 " %c %s\n", self->start, self->end, self->binding == STB_GLOBAL ? 'g' : self->binding == STB_LOCAL ? 'l' : 'w', @@ -1161,6 +1162,13 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name, section_name = elf_sec__name(&shdr, secstrs); + /* On ARM, symbols for thumb functions have 1 added to + * the symbol address as a flag - remove it */ + if ((ehdr.e_machine == EM_ARM) && + (map->type == MAP__FUNCTION) && + (sym.st_value & 1)) + --sym.st_value; + if (self->kernel != DSO_TYPE_USER || kmodule) { char dso_name[PATH_MAX]; @@ -1208,8 +1216,8 @@ static int dso__load_sym(struct dso *self, struct map *map, const char *name, } if (curr_dso->adjust_symbols) { - pr_debug4("%s: adjusting symbol: st_value: %#Lx " - "sh_addr: %#Lx sh_offset: %#Lx\n", __func__, + pr_debug4("%s: adjusting symbol: st_value: %#" PRIx64 " " + "sh_addr: %#" PRIx64 " sh_offset: %#" PRIx64 "\n", __func__, (u64)sym.st_value, (u64)shdr.sh_addr, (u64)shdr.sh_offset); sym.st_value -= shdr.sh_addr - shdr.sh_offset; diff --git a/tools/perf/util/types.h b/tools/perf/util/types.h index 7d6b8331f89..5f3689a3d08 100644 --- a/tools/perf/util/types.h +++ b/tools/perf/util/types.h @@ -1,12 +1,14 @@ #ifndef __PERF_TYPES_H #define __PERF_TYPES_H +#include <stdint.h> + /* - * We define u64 as unsigned long long for every architecture - * so that we can print it with %Lx without getting warnings. + * We define u64 as uint64_t for every architecture + * so that we can print it with "%"PRIx64 without getting warnings. */ -typedef unsigned long long u64; -typedef signed long long s64; +typedef uint64_t u64; +typedef int64_t s64; typedef unsigned int u32; typedef signed int s32; typedef unsigned short u16; diff --git a/tools/perf/util/ui/browsers/hists.c b/tools/perf/util/ui/browsers/hists.c index ebda8c3fde9..60c463c1602 100644 --- a/tools/perf/util/ui/browsers/hists.c +++ b/tools/perf/util/ui/browsers/hists.c @@ -350,7 +350,7 @@ static char *callchain_list__sym_name(struct callchain_list *self, if (self->ms.sym) return self->ms.sym->name; - snprintf(bf, bfsize, "%#Lx", self->ip); + snprintf(bf, bfsize, "%#" PRIx64, self->ip); return bf; } diff --git a/tools/perf/util/ui/browsers/map.c b/tools/perf/util/ui/browsers/map.c index e35437dfa5b..e5158369106 100644 --- a/tools/perf/util/ui/browsers/map.c +++ b/tools/perf/util/ui/browsers/map.c @@ -1,5 +1,6 @@ #include "../libslang.h" #include <elf.h> +#include <inttypes.h> #include <sys/ttydefaults.h> #include <ctype.h> #include <string.h> @@ -57,7 +58,7 @@ static void map_browser__write(struct ui_browser *self, void *nd, int row) int width; ui_browser__set_percent_color(self, 0, current_entry); - slsmg_printf("%*llx %*llx %c ", + slsmg_printf("%*" PRIx64 " %*" PRIx64 " %c ", mb->addrlen, sym->start, mb->addrlen, sym->end, sym->binding == STB_GLOBAL ? 'g' : sym->binding == STB_LOCAL ? 'l' : 'w'); @@ -150,6 +151,6 @@ int map__browse(struct map *self) ++mb.b.nr_entries; } - mb.addrlen = snprintf(tmp, sizeof(tmp), "%llx", maxaddr); + mb.addrlen = snprintf(tmp, sizeof(tmp), "%" PRIx64, maxaddr); return map_browser__run(&mb); } diff --git a/tools/perf/util/values.c b/tools/perf/util/values.c index cfa55d686e3..bdd33470b23 100644 --- a/tools/perf/util/values.c +++ b/tools/perf/util/values.c @@ -150,7 +150,7 @@ static void perf_read_values__display_pretty(FILE *fp, if (width > tidwidth) tidwidth = width; for (j = 0; j < values->counters; j++) { - width = snprintf(NULL, 0, "%Lu", values->value[i][j]); + width = snprintf(NULL, 0, "%" PRIu64, values->value[i][j]); if (width > counterwidth[j]) counterwidth[j] = width; } @@ -165,7 +165,7 @@ static void perf_read_values__display_pretty(FILE *fp, fprintf(fp, " %*d %*d", pidwidth, values->pid[i], tidwidth, values->tid[i]); for (j = 0; j < values->counters; j++) - fprintf(fp, " %*Lu", + fprintf(fp, " %*" PRIu64, counterwidth[j], values->value[i][j]); fprintf(fp, "\n"); } @@ -196,13 +196,13 @@ static void perf_read_values__display_raw(FILE *fp, width = strlen(values->countername[j]); if (width > namewidth) namewidth = width; - width = snprintf(NULL, 0, "%llx", values->counterrawid[j]); + width = snprintf(NULL, 0, "%" PRIx64, values->counterrawid[j]); if (width > rawwidth) rawwidth = width; } for (i = 0; i < values->threads; i++) { for (j = 0; j < values->counters; j++) { - width = snprintf(NULL, 0, "%Lu", values->value[i][j]); + width = snprintf(NULL, 0, "%" PRIu64, values->value[i][j]); if (width > countwidth) countwidth = width; } @@ -214,7 +214,7 @@ static void perf_read_values__display_raw(FILE *fp, countwidth, "Count"); for (i = 0; i < values->threads; i++) for (j = 0; j < values->counters; j++) - fprintf(fp, " %*d %*d %*s %*llx %*Lu\n", + fprintf(fp, " %*d %*d %*s %*" PRIx64 " %*" PRIu64, pidwidth, values->pid[i], tidwidth, values->tid[i], namewidth, values->countername[j], |