generic: 5.15: drop upstream patch

Drop upstream patches from the backport dir for kernel 5.15

Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
Ansuel Smith 2021-11-04 23:21:11 +01:00 committed by Daniel Golle
parent 9a038e7fd1
commit 79dfa44733
188 changed files with 0 additions and 26893 deletions

View File

@@ -1,30 +0,0 @@
From 13b1ecc3401653a355798eb1dee10cc1608202f4 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 18 Jan 2016 12:27:49 +0100
Subject: [PATCH 33/34] Kbuild: don't hardcode path to awk in
scripts/ld-version.sh
On some systems /usr/bin/awk does not exist, or is broken. Find it via
$PATH instead.
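As a usage sketch (the exact banner and invocation shown here are assumptions, not part of the patch), the script reads the linker's version string on stdin and prints a single comparable number:

$ ld --version | sh scripts/ld-version.sh
235010000    # for "GNU ld (GNU Binutils) 2.35.1": 2*100000000 + 35*1000000 + 1*10000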
Signed-off-by: Felix Fietkau <nbd@nbd.name>
---
scripts/ld-version.sh | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/scripts/ld-version.sh
+++ b/scripts/ld-version.sh
@@ -1,6 +1,7 @@
-#!/usr/bin/awk -f
+#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
# extract linker version number from stdin and turn into single number
+exec awk '
{
gsub(".*\\)", "");
gsub(".*version ", "");
@@ -9,3 +10,4 @@
print a[1]*100000000 + a[2]*1000000 + a[3]*10000;
exit
}
+'

View File

@@ -1,27 +0,0 @@
From 1027a42c25cbf8cfc4ade6503c5110aae04866af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Gonz=C3=A1lez=20Cabanelas?= <dgcbueu@gmail.com>
Date: Fri, 16 Oct 2020 20:22:37 +0200
Subject: [PATCH] power: reset: linkstation-poweroff: add missing put_device()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
of_mdio_find_bus() takes a reference to the underlying device
structure; we should release that reference using a put_device() call.
Signed-off-by: Daniel González Cabanelas <dgcbueu@gmail.com>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
drivers/power/reset/linkstation-poweroff.c | 1 +
1 file changed, 1 insertion(+)
--- a/drivers/power/reset/linkstation-poweroff.c
+++ b/drivers/power/reset/linkstation-poweroff.c
@@ -113,6 +113,7 @@ static int __init linkstation_poweroff_i
return -EPROBE_DEFER;
phydev = phy_find_first(bus);
+ put_device(&bus->dev);
if (!phydev)
return -EPROBE_DEFER;

View File

@@ -1,272 +0,0 @@
From 03662fcd41f4b764857f17b95f9a2a63c24bddd4 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Tue, 3 Nov 2020 17:28:09 +0100
Subject: [PATCH 1/2] crypto: arm/chacha-neon - optimize for non-block size
multiples
commit 86cd97ec4b943af35562a74688bc4e909b32c3d1 upstream.
The current NEON based ChaCha implementation for ARM is optimized for
multiples of 4x the ChaCha block size (64 bytes). This makes sense for
block encryption, but given that ChaCha is also often used in the
context of networking, it makes sense to consider arbitrary length
inputs as well.
For example, WireGuard typically uses 1420 byte packets, and performing
ChaCha encryption involves 5 invocations of chacha_4block_xor_neon()
and 3 invocations of chacha_block_xor_neon(), where the last one also
involves a memcpy() using a buffer on the stack to process the final
chunk of 1420 % 64 == 12 bytes.
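For concreteness, the per-packet arithmetic behind those invocation counts (an illustrative breakdown, not part of the patch):

/* 1420-byte packet, CHACHA_BLOCK_SIZE == 64:
 *   5 x chacha_4block_xor_neon()  -> 5 * 256 = 1280 bytes
 *   2 x chacha_block_xor_neon()   -> 2 *  64 =  128 bytes
 *   1 x chacha_block_xor_neon()   -> 1420 % 64 =  12 bytes,
 *       bounced through the on-stack buffer
 * total: 1280 + 128 + 12 = 1420 bytes
 */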
Let's optimize for this case as well, by letting chacha_4block_xor_neon()
deal with any input size between 64 and 256 bytes, using NEON permutation
instructions and overlapping loads and stores. This way, the 140 byte
tail of a 1420 byte input buffer can simply be processed in one go.
This results in the following performance improvements for 1420 byte
blocks, without significant impact on power-of-2 input sizes. (Note
that Raspberry Pi is widely used in combination with a 32-bit kernel,
even though the core is 64-bit capable)
Cortex-A8 (BeagleBone) : 7%
Cortex-A15 (Calxeda Midway) : 21%
Cortex-A53 (Raspberry Pi 3) : 3%
Cortex-A72 (Raspberry Pi 4) : 19%
Cc: Eric Biggers <ebiggers@google.com>
Cc: "Jason A . Donenfeld" <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/chacha-glue.c | 34 +++++------
arch/arm/crypto/chacha-neon-core.S | 97 +++++++++++++++++++++++++++---
2 files changed, 107 insertions(+), 24 deletions(-)
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -23,7 +23,7 @@
asmlinkage void chacha_block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
int nrounds);
asmlinkage void chacha_4block_xor_neon(const u32 *state, u8 *dst, const u8 *src,
- int nrounds);
+ int nrounds, unsigned int nbytes);
asmlinkage void hchacha_block_arm(const u32 *state, u32 *out, int nrounds);
asmlinkage void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
@@ -42,24 +42,24 @@ static void chacha_doneon(u32 *state, u8
{
u8 buf[CHACHA_BLOCK_SIZE];
- while (bytes >= CHACHA_BLOCK_SIZE * 4) {
- chacha_4block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE * 4;
- src += CHACHA_BLOCK_SIZE * 4;
- dst += CHACHA_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- while (bytes >= CHACHA_BLOCK_SIZE) {
- chacha_block_xor_neon(state, dst, src, nrounds);
- bytes -= CHACHA_BLOCK_SIZE;
- src += CHACHA_BLOCK_SIZE;
- dst += CHACHA_BLOCK_SIZE;
- state[12]++;
+ while (bytes > CHACHA_BLOCK_SIZE) {
+ unsigned int l = min(bytes, CHACHA_BLOCK_SIZE * 4U);
+
+ chacha_4block_xor_neon(state, dst, src, nrounds, l);
+ bytes -= l;
+ src += l;
+ dst += l;
+ state[12] += DIV_ROUND_UP(l, CHACHA_BLOCK_SIZE);
}
if (bytes) {
- memcpy(buf, src, bytes);
- chacha_block_xor_neon(state, buf, buf, nrounds);
- memcpy(dst, buf, bytes);
+ const u8 *s = src;
+ u8 *d = dst;
+
+ if (bytes != CHACHA_BLOCK_SIZE)
+ s = d = memcpy(buf, src, bytes);
+ chacha_block_xor_neon(state, d, s, nrounds);
+ if (d != dst)
+ memcpy(dst, buf, bytes);
}
}
--- a/arch/arm/crypto/chacha-neon-core.S
+++ b/arch/arm/crypto/chacha-neon-core.S
@@ -47,6 +47,7 @@
*/
#include <linux/linkage.h>
+#include <asm/cache.h>
.text
.fpu neon
@@ -205,7 +206,7 @@ ENDPROC(hchacha_block_neon)
.align 5
ENTRY(chacha_4block_xor_neon)
- push {r4-r5}
+ push {r4, lr}
mov r4, sp // preserve the stack pointer
sub ip, sp, #0x20 // allocate a 32 byte buffer
bic ip, ip, #0x1f // aligned to 32 bytes
@@ -229,10 +230,10 @@ ENTRY(chacha_4block_xor_neon)
vld1.32 {q0-q1}, [r0]
vld1.32 {q2-q3}, [ip]
- adr r5, .Lctrinc
+ adr lr, .Lctrinc
vdup.32 q15, d7[1]
vdup.32 q14, d7[0]
- vld1.32 {q4}, [r5, :128]
+ vld1.32 {q4}, [lr, :128]
vdup.32 q13, d6[1]
vdup.32 q12, d6[0]
vdup.32 q11, d5[1]
@@ -455,7 +456,7 @@ ENTRY(chacha_4block_xor_neon)
// Re-interleave the words in the first two rows of each block (x0..7).
// Also add the counter values 0-3 to x12[0-3].
- vld1.32 {q8}, [r5, :128] // load counter values 0-3
+ vld1.32 {q8}, [lr, :128] // load counter values 0-3
vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
@@ -493,6 +494,8 @@ ENTRY(chacha_4block_xor_neon)
// Re-interleave the words in the last two rows of each block (x8..15).
vld1.32 {q8-q9}, [sp, :256]
+ mov sp, r4 // restore original stack pointer
+ ldr r4, [r4, #8] // load number of bytes
vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
@@ -520,41 +523,121 @@ ENTRY(chacha_4block_xor_neon)
// XOR the rest of the data with the keystream
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #96
veor q0, q0, q8
veor q1, q1, q12
+ ble .Lle96
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q2
veor q1, q1, q6
+ ble .Lle128
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q10
veor q1, q1, q14
+ ble .Lle160
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q4
veor q1, q1, q5
+ ble .Lle192
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q9
veor q1, q1, q13
+ ble .Lle224
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
+ subs r4, r4, #32
veor q0, q0, q3
veor q1, q1, q7
+ blt .Llt256
+.Lout:
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]
- mov sp, r4 // restore original stack pointer
veor q0, q0, q11
veor q1, q1, q15
vst1.8 {q0-q1}, [r1]
- pop {r4-r5}
- bx lr
+ pop {r4, pc}
+
+.Lle192:
+ vmov q4, q9
+ vmov q5, q13
+
+.Lle160:
+ // nothing to do
+
+.Lfinalblock:
+ // Process the final block if processing less than 4 full blocks.
+ // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
+ // previous 32 byte output block that still needs to be written at
+ // [r1] in q0-q1.
+ beq .Lfullblock
+
+.Lpartialblock:
+ adr lr, .Lpermute + 32
+ add r2, r2, r4
+ add lr, lr, r4
+ add r4, r4, r1
+
+ vld1.8 {q2-q3}, [lr]
+ vld1.8 {q6-q7}, [r2]
+
+ add r4, r4, #32
+
+ vtbl.8 d4, {q4-q5}, d4
+ vtbl.8 d5, {q4-q5}, d5
+ vtbl.8 d6, {q4-q5}, d6
+ vtbl.8 d7, {q4-q5}, d7
+
+ veor q6, q6, q2
+ veor q7, q7, q3
+
+ vst1.8 {q6-q7}, [r4] // overlapping stores
+ vst1.8 {q0-q1}, [r1]
+ pop {r4, pc}
+
+.Lfullblock:
+ vmov q11, q4
+ vmov q15, q5
+ b .Lout
+.Lle96:
+ vmov q4, q2
+ vmov q5, q6
+ b .Lfinalblock
+.Lle128:
+ vmov q4, q10
+ vmov q5, q14
+ b .Lfinalblock
+.Lle224:
+ vmov q4, q3
+ vmov q5, q7
+ b .Lfinalblock
+.Llt256:
+ vmov q4, q11
+ vmov q5, q15
+ b .Lpartialblock
ENDPROC(chacha_4block_xor_neon)
+
+ .align L1_CACHE_SHIFT
+.Lpermute:
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+ .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
+ .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f

View File

@@ -1,38 +0,0 @@
From 7f63462faf9eab69132bea9abd48c2c05a93145b Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Sun, 13 Dec 2020 15:39:29 +0100
Subject: [PATCH 2/2] crypto: arm/chacha-neon - add missing counter increment
commit fd16931a2f518a32753920ff20895e5cf04c8ff1 upstream.
Commit 86cd97ec4b943af3 ("crypto: arm/chacha-neon - optimize for non-block
size multiples") refactored the chacha block handling in the glue code in
a way that may result in the counter increment to be omitted when calling
chacha_block_xor_neon() to process a full block. This violates the skcipher
API, which requires that the output IV is suitable for handling more input
as long as the preceding input has been presented in round multiples of the
block size. Also, the same code is exposed via the chacha library interface
whose callers may actually rely on this increment to occur even for final
blocks that are smaller than the chacha block size.
So increment the counter after calling chacha_block_xor_neon().
Fixes: 86cd97ec4b943af3 ("crypto: arm/chacha-neon - optimize for non-block size multiples")
Reported-by: Eric Biggers <ebiggers@kernel.org>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/arm/crypto/chacha-glue.c | 1 +
1 file changed, 1 insertion(+)
--- a/arch/arm/crypto/chacha-glue.c
+++ b/arch/arm/crypto/chacha-glue.c
@@ -60,6 +60,7 @@ static void chacha_doneon(u32 *state, u8
chacha_block_xor_neon(state, d, s, nrounds);
if (d != dst)
memcpy(dst, buf, bytes);
+ state[12]++;
}
}

View File

@@ -1,42 +0,0 @@
From a13827e9091c07e25cdeec9a402d74a27e2a1111 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Mon, 22 Feb 2021 17:25:46 +0100
Subject: [PATCH] wireguard: peer: put frequently used members above cache
lines
commit 5a0598695634a6bb4126818902dd9140cd9df8b6 upstream.
The is_dead boolean is checked for every single packet, while the
internal_id member is used basically only for pr_debug messages. So it
makes sense to hoist up is_dead into some space formerly unused by a
struct hole, while demoting internal_api to below the lowest struct
cache line.
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
drivers/net/wireguard/peer.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
--- a/drivers/net/wireguard/peer.h
+++ b/drivers/net/wireguard/peer.h
@@ -39,6 +39,7 @@ struct wg_peer {
struct prev_queue tx_queue, rx_queue;
struct sk_buff_head staged_packet_queue;
int serial_work_cpu;
+ bool is_dead;
struct noise_keypairs keypairs;
struct endpoint endpoint;
struct dst_cache endpoint_cache;
@@ -61,9 +62,8 @@ struct wg_peer {
struct rcu_head rcu;
struct list_head peer_list;
struct list_head allowedips_list;
- u64 internal_id;
struct napi_struct napi;
- bool is_dead;
+ u64 internal_id;
};
struct wg_peer *wg_peer_create(struct wg_device *wg,

View File

@@ -1,36 +0,0 @@
From 6523061868212473f63812a0c477a161742bed42 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Sat, 27 Feb 2021 13:20:24 +0100
Subject: [PATCH] MIPS: select CPU_MIPS64 for remaining MIPS64 CPUs
The CPU_MIPS64 and CPU_MIPS32 variables are supposed to be able to
distinguish broadly between 64-bit and 32-bit MIPS CPUs. However, they
weren't selected by the specialty CPUs, Octeon and Loongson, which meant
it was possible to hit a weird state of:
MIPS=y, CONFIG_64BIT=y, CPU_MIPS64=n
This commit rectifies the issue by having CPU_MIPS64 be selected when
the missing Octeon or Loongson models are selected.
Cc: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: George Cherian <gcherian@marvell.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Jiaxun Yang <jiaxun.yang@flygoat.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
arch/mips/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -2088,7 +2088,7 @@ config CPU_MIPS32
config CPU_MIPS64
bool
default y if CPU_MIPS64_R1 || CPU_MIPS64_R2 || CPU_MIPS64_R5 || \
- CPU_MIPS64_R6
+ CPU_MIPS64_R6 || CPU_LOONGSON64 || CPU_CAVIUM_OCTEON
#
# These indicate the revision of the architecture

View File

@@ -1,36 +0,0 @@
From 7d1531c81c0fb4c93bea8dc316043ad0e4d0c270 Mon Sep 17 00:00:00 2001
From: Chuanhong Guo <gch981213@gmail.com>
Date: Sun, 25 Oct 2020 23:19:40 +0800
Subject: [PATCH] MIPS: zboot: put appended dtb into a section
This will make a separate section for the dtb appear in the ELF, and we can
then use objcopy to patch a dtb into vmlinuz when RAW_APPENDED_DTB
is set in kernel config.
command to patch a dtb:
objcopy --set-section-flags=.appended_dtb=alloc,contents \
--update-section=.appended_dtb=<target>.dtb vmlinuz vmlinuz-dtb
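A quick way to sanity-check the result (an illustrative command, not from the patch) is to list the sections of the patched image and confirm .appended_dtb is present:

objdump -h vmlinuz-dtb | grep appended_dtb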
Signed-off-by: Chuanhong Guo <gch981213@gmail.com>
---
arch/mips/boot/compressed/ld.script | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
--- a/arch/mips/boot/compressed/ld.script
+++ b/arch/mips/boot/compressed/ld.script
@@ -31,9 +31,12 @@ SECTIONS
CONSTRUCTORS
. = ALIGN(16);
}
- __appended_dtb = .;
- /* leave space for appended DTB */
- . += 0x100000;
+
+ .appended_dtb : {
+ __appended_dtb = .;
+ /* leave space for appended DTB */
+ . += 0x100000;
+ }
_edata = .;
/* End of data section */

View File

@@ -1,324 +0,0 @@
From 04e9ab75267489224364fa510a88ada83e11c325 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Thu, 10 Dec 2020 18:23:52 +0100
Subject: [PATCH] dt-bindings: mtd: convert "fixed-partitions" to the
json-schema
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This standardizes its documentation, allows validating with Makefile
checks and helps writing DTS files.
Noticeable changes:
1. Dropped "Partitions can be represented by sub-nodes of a flash
device." as we also support subpartitions (don't have to be part of
flash device node)
2. Dropped "to Linux" as bindings are meant to be os agnostic.
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Link: https://lore.kernel.org/r/20201210172352.31632-1-zajec5@gmail.com
Signed-off-by: Rob Herring <robh@kernel.org>
---
.../devicetree/bindings/mtd/partition.txt | 131 +--------------
.../mtd/partitions/fixed-partitions.yaml | 152 ++++++++++++++++++
2 files changed, 154 insertions(+), 129 deletions(-)
create mode 100644 Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml
--- a/Documentation/devicetree/bindings/mtd/partition.txt
+++ b/Documentation/devicetree/bindings/mtd/partition.txt
@@ -24,137 +24,10 @@ another partitioning method.
Available bindings are listed in the "partitions" subdirectory.
-Fixed Partitions
-================
-
-Partitions can be represented by sub-nodes of a flash device. This can be used
-on platforms which have strong conventions about which portions of a flash are
-used for what purposes, but which don't use an on-flash partition table such
-as RedBoot.
-
-The partition table should be a subnode of the flash node and should be named
-'partitions'. This node should have the following property:
-- compatible : (required) must be "fixed-partitions"
-Partitions are then defined in subnodes of the partitions node.
+Deprecated: partitions defined in flash node
+============================================
For backwards compatibility partitions as direct subnodes of the flash device are
supported. This use is discouraged.
NOTE: also for backwards compatibility, direct subnodes that have a compatible
string are not considered partitions, as they may be used for other bindings.
-
-#address-cells & #size-cells must both be present in the partitions subnode of the
-flash device. There are two valid values for both:
-<1>: for partitions that require a single 32-bit cell to represent their
- size/address (aka the value is below 4 GiB)
-<2>: for partitions that require two 32-bit cells to represent their
- size/address (aka the value is 4 GiB or greater).
-
-Required properties:
-- reg : The partition's offset and size within the flash
-
-Optional properties:
-- label : The label / name for this partition. If omitted, the label is taken
- from the node name (excluding the unit address).
-- read-only : This parameter, if present, is a hint to Linux that this
- partition should only be mounted read-only. This is usually used for flash
- partitions containing early-boot firmware images or data which should not be
- clobbered.
-- lock : Do not unlock the partition at initialization time (not supported on
- all devices)
-- slc-mode: This parameter, if present, allows one to emulate SLC mode on a
- partition attached to an MLC NAND thus making this partition immune to
- paired-pages corruptions
-
-Examples:
-
-
-flash@0 {
- partitions {
- compatible = "fixed-partitions";
- #address-cells = <1>;
- #size-cells = <1>;
-
- partition@0 {
- label = "u-boot";
- reg = <0x0000000 0x100000>;
- read-only;
- };
-
- uimage@100000 {
- reg = <0x0100000 0x200000>;
- };
- };
-};
-
-flash@1 {
- partitions {
- compatible = "fixed-partitions";
- #address-cells = <1>;
- #size-cells = <2>;
-
- /* a 4 GiB partition */
- partition@0 {
- label = "filesystem";
- reg = <0x00000000 0x1 0x00000000>;
- };
- };
-};
-
-flash@2 {
- partitions {
- compatible = "fixed-partitions";
- #address-cells = <2>;
- #size-cells = <2>;
-
- /* an 8 GiB partition */
- partition@0 {
- label = "filesystem #1";
- reg = <0x0 0x00000000 0x2 0x00000000>;
- };
-
- /* a 4 GiB partition */
- partition@200000000 {
- label = "filesystem #2";
- reg = <0x2 0x00000000 0x1 0x00000000>;
- };
- };
-};
-
-flash@3 {
- partitions {
- compatible = "fixed-partitions";
- #address-cells = <1>;
- #size-cells = <1>;
-
- partition@0 {
- label = "bootloader";
- reg = <0x000000 0x100000>;
- read-only;
- };
-
- firmware@100000 {
- label = "firmware";
- reg = <0x100000 0xe00000>;
- compatible = "brcm,trx";
- };
-
- calibration@f00000 {
- label = "calibration";
- reg = <0xf00000 0x100000>;
- compatible = "fixed-partitions";
- ranges = <0 0xf00000 0x100000>;
- #address-cells = <1>;
- #size-cells = <1>;
-
- partition@0 {
- label = "wifi0";
- reg = <0x000000 0x080000>;
- };
-
- partition@80000 {
- label = "wifi1";
- reg = <0x080000 0x080000>;
- };
- };
- };
-};
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml
@@ -0,0 +1,152 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mtd/partitions/fixed-partitions.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Fixed partitions
+
+description: |
+ This binding can be used on platforms which have strong conventions about
+ which portions of a flash are used for what purposes, but which don't use an
+ on-flash partition table such as RedBoot.
+
+ The partition table should be a node named "partitions". Partitions are then
+ defined as subnodes.
+
+maintainers:
+ - Rafał Miłecki <rafal@milecki.pl>
+
+properties:
+ compatible:
+ const: fixed-partitions
+
+ "#address-cells": true
+
+ "#size-cells": true
+
+patternProperties:
+ "@[0-9a-f]+$":
+ description: node describing a single flash partition
+ type: object
+
+ properties:
+ reg:
+ description: partition's offset and size within the flash
+ maxItems: 1
+
+ label:
+ description: The label / name for this partition. If omitted, the label
+ is taken from the node name (excluding the unit address).
+
+ read-only:
+ description: This parameter, if present, is a hint that this partition
+ should only be mounted read-only. This is usually used for flash
+ partitions containing early-boot firmware images or data which should
+ not be clobbered.
+ type: boolean
+
+ lock:
+ description: Do not unlock the partition at initialization time (not
+ supported on all devices)
+ type: boolean
+
+ slc-mode:
+ description: This parameter, if present, allows one to emulate SLC mode
+ on a partition attached to an MLC NAND thus making this partition
+ immune to paired-pages corruptions
+ type: boolean
+
+ required:
+ - reg
+
+required:
+ - "#address-cells"
+ - "#size-cells"
+
+additionalProperties: true
+
+examples:
+ - |
+ partitions {
+ compatible = "fixed-partitions";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ partition@0 {
+ label = "u-boot";
+ reg = <0x0000000 0x100000>;
+ read-only;
+ };
+
+ uimage@100000 {
+ reg = <0x0100000 0x200000>;
+ };
+ };
+ - |
+ partitions {
+ compatible = "fixed-partitions";
+ #address-cells = <1>;
+ #size-cells = <2>;
+
+ /* a 4 GiB partition */
+ partition@0 {
+ label = "filesystem";
+ reg = <0x00000000 0x1 0x00000000>;
+ };
+ };
+ - |
+ partitions {
+ compatible = "fixed-partitions";
+ #address-cells = <2>;
+ #size-cells = <2>;
+
+ /* an 8 GiB partition */
+ partition@0 {
+ label = "filesystem #1";
+ reg = <0x0 0x00000000 0x2 0x00000000>;
+ };
+
+ /* a 4 GiB partition */
+ partition@200000000 {
+ label = "filesystem #2";
+ reg = <0x2 0x00000000 0x1 0x00000000>;
+ };
+ };
+ - |
+ partitions {
+ compatible = "fixed-partitions";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ partition@0 {
+ label = "bootloader";
+ reg = <0x000000 0x100000>;
+ read-only;
+ };
+
+ firmware@100000 {
+ compatible = "brcm,trx";
+ label = "firmware";
+ reg = <0x100000 0xe00000>;
+ };
+
+ calibration@f00000 {
+ compatible = "fixed-partitions";
+ label = "calibration";
+ reg = <0xf00000 0x100000>;
+ ranges = <0 0xf00000 0x100000>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ partition@0 {
+ label = "wifi0";
+ reg = <0x000000 0x080000>;
+ };
+
+ partition@80000 {
+ label = "wifi1";
+ reg = <0x080000 0x080000>;
+ };
+ };
+ };

View File

@@ -1,115 +0,0 @@
From 6418522022c706fd867b00b2571edba48b8fa8c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Thu, 11 Feb 2021 23:04:25 +0100
Subject: [PATCH] dt-bindings: mtd: move partition binding to its own file
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Single partition binding is quite common and may be:
1. Used by multiple parsers
2. Extended for more specific cases
Move it to a separate file to avoid code duplication.
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
.../mtd/partitions/fixed-partitions.yaml | 33 +------------
.../bindings/mtd/partitions/partition.yaml | 47 +++++++++++++++++++
2 files changed, 48 insertions(+), 32 deletions(-)
create mode 100644 Documentation/devicetree/bindings/mtd/partitions/partition.yaml
--- a/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml
+++ b/Documentation/devicetree/bindings/mtd/partitions/fixed-partitions.yaml
@@ -27,38 +27,7 @@ properties:
patternProperties:
"@[0-9a-f]+$":
- description: node describing a single flash partition
- type: object
-
- properties:
- reg:
- description: partition's offset and size within the flash
- maxItems: 1
-
- label:
- description: The label / name for this partition. If omitted, the label
- is taken from the node name (excluding the unit address).
-
- read-only:
- description: This parameter, if present, is a hint that this partition
- should only be mounted read-only. This is usually used for flash
- partitions containing early-boot firmware images or data which should
- not be clobbered.
- type: boolean
-
- lock:
- description: Do not unlock the partition at initialization time (not
- supported on all devices)
- type: boolean
-
- slc-mode:
- description: This parameter, if present, allows one to emulate SLC mode
- on a partition attached to an MLC NAND thus making this partition
- immune to paired-pages corruptions
- type: boolean
-
- required:
- - reg
+ $ref: "partition.yaml#"
required:
- "#address-cells"
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/partitions/partition.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mtd/partitions/partition.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Partition
+
+description: |
+ This binding describes a single flash partition. Each partition must have its
+ relative offset and size specified. Depending on partition function extra
+ properties can be used.
+
+maintainers:
+ - Rafał Miłecki <rafal@milecki.pl>
+
+properties:
+ reg:
+ description: partition's offset and size within the flash
+ maxItems: 1
+
+ label:
+ description: The label / name for this partition. If omitted, the label
+ is taken from the node name (excluding the unit address).
+
+ read-only:
+ description: This parameter, if present, is a hint that this partition
+ should only be mounted read-only. This is usually used for flash
+ partitions containing early-boot firmware images or data which should
+ not be clobbered.
+ type: boolean
+
+ lock:
+ description: Do not unlock the partition at initialization time (not
+ supported on all devices)
+ type: boolean
+
+ slc-mode:
+ description: This parameter, if present, allows one to emulate SLC mode
+ on a partition attached to an MLC NAND thus making this partition
+ immune to paired-pages corruptions
+ type: boolean
+
+required:
+ - reg
+
+additionalProperties: true

View File

@@ -1,92 +0,0 @@
From 6e9dff6fe3fbc452f16566e4a7e293b0decefdba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Thu, 11 Feb 2021 23:04:26 +0100
Subject: [PATCH] dt-bindings: mtd: add binding for BCM4908 partitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
BCM4908 uses a fixed partition layout, but the function of some partitions
may vary. Some devices use multiple firmware partitions, and those
partitions should be marked to let the system discover their purpose.
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
.../partitions/brcm,bcm4908-partitions.yaml | 70 +++++++++++++++++++
1 file changed, 70 insertions(+)
create mode 100644 Documentation/devicetree/bindings/mtd/partitions/brcm,bcm4908-partitions.yaml
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/partitions/brcm,bcm4908-partitions.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mtd/partitions/brcm,bcm4908-partitions.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Broadcom BCM4908 partitioning
+
+description: |
+ Broadcom BCM4908 CFE bootloader supports two firmware partitions. One is used
+ for regular booting, the other is treated as fallback.
+
+ This binding allows defining all fixed partitions and marking those containing
+ firmware. System can use that information e.g. for booting or flashing
+ purposes.
+
+maintainers:
+ - Rafał Miłecki <rafal@milecki.pl>
+
+properties:
+ compatible:
+ const: brcm,bcm4908-partitions
+
+ "#address-cells":
+ enum: [ 1, 2 ]
+
+ "#size-cells":
+ enum: [ 1, 2 ]
+
+patternProperties:
+ "^partition@[0-9a-f]+$":
+ $ref: "partition.yaml#"
+ properties:
+ compatible:
+ const: brcm,bcm4908-firmware
+ unevaluatedProperties: false
+
+required:
+ - "#address-cells"
+ - "#size-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ partitions {
+ compatible = "brcm,bcm4908-partitions";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ partition@0 {
+ label = "cferom";
+ reg = <0x0 0x100000>;
+ };
+
+ partition@100000 {
+ compatible = "brcm,bcm4908-firmware";
+ reg = <0x100000 0xf00000>;
+ };
+
+ partition@1000000 {
+ compatible = "brcm,bcm4908-firmware";
+ reg = <0x1000000 0xf00000>;
+ };
+
+ partition@1f00000 {
+ label = "calibration";
+ reg = <0x1f00000 0x100000>;
+ };
+ };

View File

@@ -1,654 +0,0 @@
From afbef8efb591792579c633a7c545f914c6165f82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Thu, 11 Feb 2021 23:04:27 +0100
Subject: [PATCH] mtd: parsers: ofpart: support BCM4908 fixed partitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Some devices use fixed partitioning with some partitions requiring extra
logic. E.g. BCM4908 may have multiple firmware partitions, but detecting
the currently used one requires checking bootloader parameters.
To support such cases without duplicating a lot of code (without copying
most of the ofpart.c code), support for a post-parsing callback was added.
BCM4908 support in ofpart can be enabled using a config option and results
in compiling & executing a specific callback. It simply reads the offset
of the currently used firmware partition from the DT. The bootloader
specifies it using the "brcm_blparms" property.
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
---
drivers/mtd/parsers/Kconfig | 9 +++
drivers/mtd/parsers/Makefile | 2 +
drivers/mtd/parsers/ofpart_bcm4908.c | 64 +++++++++++++++++++
drivers/mtd/parsers/ofpart_bcm4908.h | 15 +++++
.../mtd/parsers/{ofpart.c => ofpart_core.c} | 28 +++++++-
5 files changed, 116 insertions(+), 2 deletions(-)
create mode 100644 drivers/mtd/parsers/ofpart_bcm4908.c
create mode 100644 drivers/mtd/parsers/ofpart_bcm4908.h
rename drivers/mtd/parsers/{ofpart.c => ofpart_core.c} (88%)
--- a/drivers/mtd/parsers/Kconfig
+++ b/drivers/mtd/parsers/Kconfig
@@ -67,6 +67,15 @@ config MTD_OF_PARTS
flash memory node, as described in
Documentation/devicetree/bindings/mtd/partition.txt.
+config MTD_OF_PARTS_BCM4908
+ bool "BCM4908 partitioning support"
+ depends on MTD_OF_PARTS && (ARCH_BCM4908 || COMPILE_TEST)
+ default ARCH_BCM4908
+ help
+ This provides partitions parser for BCM4908 family devices
+ that can have multiple "firmware" partitions. It takes care of
+ finding currently used one and backup ones.
+
config MTD_PARSER_IMAGETAG
tristate "Parser for BCM963XX Image Tag format partitions"
depends on BCM63XX || BMIPS_GENERIC || COMPILE_TEST
--- a/drivers/mtd/parsers/Makefile
+++ b/drivers/mtd/parsers/Makefile
@@ -4,6 +4,8 @@ obj-$(CONFIG_MTD_BCM47XX_PARTS) += bcm4
obj-$(CONFIG_MTD_BCM63XX_PARTS) += bcm63xxpart.o
obj-$(CONFIG_MTD_CMDLINE_PARTS) += cmdlinepart.o
obj-$(CONFIG_MTD_OF_PARTS) += ofpart.o
+ofpart-y += ofpart_core.o
+ofpart-$(CONFIG_MTD_OF_PARTS_BCM4908) += ofpart_bcm4908.o
obj-$(CONFIG_MTD_PARSER_IMAGETAG) += parser_imagetag.o
obj-$(CONFIG_MTD_AFS_PARTS) += afs.o
obj-$(CONFIG_MTD_PARSER_TRX) += parser_trx.o
--- /dev/null
+++ b/drivers/mtd/parsers/ofpart_bcm4908.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Rafał Miłecki <rafal@milecki.pl>
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/mtd/mtd.h>
+#include <linux/slab.h>
+#include <linux/mtd/partitions.h>
+
+#include "ofpart_bcm4908.h"
+
+#define BLPARAMS_FW_OFFSET "NAND_RFS_OFS"
+
+static long long bcm4908_partitions_fw_offset(void)
+{
+ struct device_node *root;
+ struct property *prop;
+ const char *s;
+
+ root = of_find_node_by_path("/");
+ if (!root)
+ return -ENOENT;
+
+ of_property_for_each_string(root, "brcm_blparms", prop, s) {
+ size_t len = strlen(BLPARAMS_FW_OFFSET);
+ unsigned long offset;
+ int err;
+
+ if (strncmp(s, BLPARAMS_FW_OFFSET, len) || s[len] != '=')
+ continue;
+
+ err = kstrtoul(s + len + 1, 0, &offset);
+ if (err) {
+ pr_err("failed to parse %s\n", s + len + 1);
+ return err;
+ }
+
+ return offset << 10;
+ }
+
+ return -ENOENT;
+}
+
+int bcm4908_partitions_post_parse(struct mtd_info *mtd, struct mtd_partition *parts, int nr_parts)
+{
+ long long fw_offset;
+ int i;
+
+ fw_offset = bcm4908_partitions_fw_offset();
+
+ for (i = 0; i < nr_parts; i++) {
+ if (of_device_is_compatible(parts[i].of_node, "brcm,bcm4908-firmware")) {
+ if (fw_offset < 0 || parts[i].offset == fw_offset)
+ parts[i].name = "firmware";
+ else
+ parts[i].name = "backup";
+ }
+ }
+
+ return 0;
+}
--- /dev/null
+++ b/drivers/mtd/parsers/ofpart_bcm4908.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BCM4908_PARTITIONS_H
+#define __BCM4908_PARTITIONS_H
+
+#ifdef CONFIG_MTD_OF_PARTS_BCM4908
+int bcm4908_partitions_post_parse(struct mtd_info *mtd, struct mtd_partition *parts, int nr_parts);
+#else
+static inline int bcm4908_partitions_post_parse(struct mtd_info *mtd, struct mtd_partition *parts,
+ int nr_parts)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+#endif
--- a/drivers/mtd/parsers/ofpart.c
+++ /dev/null
@@ -1,239 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * Flash partitions described by the OF (or flattened) device tree
- *
- * Copyright © 2006 MontaVista Software Inc.
- * Author: Vitaly Wool <vwool@ru.mvista.com>
- *
- * Revised to handle newer style flash binding by:
- * Copyright © 2007 David Gibson, IBM Corporation.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/of.h>
-#include <linux/mtd/mtd.h>
-#include <linux/slab.h>
-#include <linux/mtd/partitions.h>
-
-static bool node_has_compatible(struct device_node *pp)
-{
- return of_get_property(pp, "compatible", NULL);
-}
-
-static int parse_fixed_partitions(struct mtd_info *master,
- const struct mtd_partition **pparts,
- struct mtd_part_parser_data *data)
-{
- struct mtd_partition *parts;
- struct device_node *mtd_node;
- struct device_node *ofpart_node;
- const char *partname;
- struct device_node *pp;
- int nr_parts, i, ret = 0;
- bool dedicated = true;
-
-
- /* Pull of_node from the master device node */
- mtd_node = mtd_get_of_node(master);
- if (!mtd_node)
- return 0;
-
- ofpart_node = of_get_child_by_name(mtd_node, "partitions");
- if (!ofpart_node) {
- /*
- * We might get here even when ofpart isn't used at all (e.g.,
- * when using another parser), so don't be louder than
- * KERN_DEBUG
- */
- pr_debug("%s: 'partitions' subnode not found on %pOF. Trying to parse direct subnodes as partitions.\n",
- master->name, mtd_node);
- ofpart_node = mtd_node;
- dedicated = false;
- } else if (!of_device_is_compatible(ofpart_node, "fixed-partitions")) {
- /* The 'partitions' subnode might be used by another parser */
- return 0;
- }
-
- /* First count the subnodes */
- nr_parts = 0;
- for_each_child_of_node(ofpart_node, pp) {
- if (!dedicated && node_has_compatible(pp))
- continue;
-
- nr_parts++;
- }
-
- if (nr_parts == 0)
- return 0;
-
- parts = kcalloc(nr_parts, sizeof(*parts), GFP_KERNEL);
- if (!parts)
- return -ENOMEM;
-
- i = 0;
- for_each_child_of_node(ofpart_node, pp) {
- const __be32 *reg;
- int len;
- int a_cells, s_cells;
-
- if (!dedicated && node_has_compatible(pp))
- continue;
-
- reg = of_get_property(pp, "reg", &len);
- if (!reg) {
- if (dedicated) {
- pr_debug("%s: ofpart partition %pOF (%pOF) missing reg property.\n",
- master->name, pp,
- mtd_node);
- goto ofpart_fail;
- } else {
- nr_parts--;
- continue;
- }
- }
-
- a_cells = of_n_addr_cells(pp);
- s_cells = of_n_size_cells(pp);
- if (len / 4 != a_cells + s_cells) {
- pr_debug("%s: ofpart partition %pOF (%pOF) error parsing reg property.\n",
- master->name, pp,
- mtd_node);
- goto ofpart_fail;
- }
-
- parts[i].offset = of_read_number(reg, a_cells);
- parts[i].size = of_read_number(reg + a_cells, s_cells);
- parts[i].of_node = pp;
-
- partname = of_get_property(pp, "label", &len);
- if (!partname)
- partname = of_get_property(pp, "name", &len);
- parts[i].name = partname;
-
- if (of_get_property(pp, "read-only", &len))
- parts[i].mask_flags |= MTD_WRITEABLE;
-
- if (of_get_property(pp, "lock", &len))
- parts[i].mask_flags |= MTD_POWERUP_LOCK;
-
- if (of_property_read_bool(pp, "slc-mode"))
- parts[i].add_flags |= MTD_SLC_ON_MLC_EMULATION;
-
- i++;
- }
-
- if (!nr_parts)
- goto ofpart_none;
-
- *pparts = parts;
- return nr_parts;
-
-ofpart_fail:
- pr_err("%s: error parsing ofpart partition %pOF (%pOF)\n",
- master->name, pp, mtd_node);
- ret = -EINVAL;
-ofpart_none:
- of_node_put(pp);
- kfree(parts);
- return ret;
-}
-
-static const struct of_device_id parse_ofpart_match_table[] = {
- { .compatible = "fixed-partitions" },
- {},
-};
-MODULE_DEVICE_TABLE(of, parse_ofpart_match_table);
-
-static struct mtd_part_parser ofpart_parser = {
- .parse_fn = parse_fixed_partitions,
- .name = "fixed-partitions",
- .of_match_table = parse_ofpart_match_table,
-};
-
-static int parse_ofoldpart_partitions(struct mtd_info *master,
- const struct mtd_partition **pparts,
- struct mtd_part_parser_data *data)
-{
- struct mtd_partition *parts;
- struct device_node *dp;
- int i, plen, nr_parts;
- const struct {
- __be32 offset, len;
- } *part;
- const char *names;
-
- /* Pull of_node from the master device node */
- dp = mtd_get_of_node(master);
- if (!dp)
- return 0;
-
- part = of_get_property(dp, "partitions", &plen);
- if (!part)
- return 0; /* No partitions found */
-
- pr_warn("Device tree uses obsolete partition map binding: %pOF\n", dp);
-
- nr_parts = plen / sizeof(part[0]);
-
- parts = kcalloc(nr_parts, sizeof(*parts), GFP_KERNEL);
- if (!parts)
- return -ENOMEM;
-
- names = of_get_property(dp, "partition-names", &plen);
-
- for (i = 0; i < nr_parts; i++) {
- parts[i].offset = be32_to_cpu(part->offset);
- parts[i].size = be32_to_cpu(part->len) & ~1;
- /* bit 0 set signifies read only partition */
- if (be32_to_cpu(part->len) & 1)
- parts[i].mask_flags = MTD_WRITEABLE;
-
- if (names && (plen > 0)) {
- int len = strlen(names) + 1;
-
- parts[i].name = names;
- plen -= len;
- names += len;
- } else {
- parts[i].name = "unnamed";
- }
-
- part++;
- }
-
- *pparts = parts;
- return nr_parts;
-}
-
-static struct mtd_part_parser ofoldpart_parser = {
- .parse_fn = parse_ofoldpart_partitions,
- .name = "ofoldpart",
-};
-
-static int __init ofpart_parser_init(void)
-{
- register_mtd_parser(&ofpart_parser);
- register_mtd_parser(&ofoldpart_parser);
- return 0;
-}
-
-static void __exit ofpart_parser_exit(void)
-{
- deregister_mtd_parser(&ofpart_parser);
- deregister_mtd_parser(&ofoldpart_parser);
-}
-
-module_init(ofpart_parser_init);
-module_exit(ofpart_parser_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Parser for MTD partitioning information in device tree");
-MODULE_AUTHOR("Vitaly Wool, David Gibson");
-/*
- * When MTD core cannot find the requested parser, it tries to load the module
- * with the same name. Since we provide the ofoldpart parser, we should have
- * the corresponding alias.
- */
-MODULE_ALIAS("fixed-partitions");
-MODULE_ALIAS("ofoldpart");
--- /dev/null
+++ b/drivers/mtd/parsers/ofpart_core.c
@@ -0,0 +1,263 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Flash partitions described by the OF (or flattened) device tree
+ *
+ * Copyright © 2006 MontaVista Software Inc.
+ * Author: Vitaly Wool <vwool@ru.mvista.com>
+ *
+ * Revised to handle newer style flash binding by:
+ * Copyright © 2007 David Gibson, IBM Corporation.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/mtd/mtd.h>
+#include <linux/slab.h>
+#include <linux/mtd/partitions.h>
+
+#include "ofpart_bcm4908.h"
+
+struct fixed_partitions_quirks {
+ int (*post_parse)(struct mtd_info *mtd, struct mtd_partition *parts, int nr_parts);
+};
+
+struct fixed_partitions_quirks bcm4908_partitions_quirks = {
+ .post_parse = bcm4908_partitions_post_parse,
+};
+
+static const struct of_device_id parse_ofpart_match_table[];
+
+static bool node_has_compatible(struct device_node *pp)
+{
+ return of_get_property(pp, "compatible", NULL);
+}
+
+static int parse_fixed_partitions(struct mtd_info *master,
+ const struct mtd_partition **pparts,
+ struct mtd_part_parser_data *data)
+{
+ const struct fixed_partitions_quirks *quirks;
+ const struct of_device_id *of_id;
+ struct mtd_partition *parts;
+ struct device_node *mtd_node;
+ struct device_node *ofpart_node;
+ const char *partname;
+ struct device_node *pp;
+ int nr_parts, i, ret = 0;
+ bool dedicated = true;
+
+ /* Pull of_node from the master device node */
+ mtd_node = mtd_get_of_node(master);
+ if (!mtd_node)
+ return 0;
+
+ ofpart_node = of_get_child_by_name(mtd_node, "partitions");
+ if (!ofpart_node) {
+ /*
+ * We might get here even when ofpart isn't used at all (e.g.,
+ * when using another parser), so don't be louder than
+ * KERN_DEBUG
+ */
+ pr_debug("%s: 'partitions' subnode not found on %pOF. Trying to parse direct subnodes as partitions.\n",
+ master->name, mtd_node);
+ ofpart_node = mtd_node;
+ dedicated = false;
+ }
+
+ of_id = of_match_node(parse_ofpart_match_table, ofpart_node);
+ if (dedicated && !of_id) {
+ /* The 'partitions' subnode might be used by another parser */
+ return 0;
+ }
+
+ quirks = of_id ? of_id->data : NULL;
+
+ /* First count the subnodes */
+ nr_parts = 0;
+ for_each_child_of_node(ofpart_node, pp) {
+ if (!dedicated && node_has_compatible(pp))
+ continue;
+
+ nr_parts++;
+ }
+
+ if (nr_parts == 0)
+ return 0;
+
+ parts = kcalloc(nr_parts, sizeof(*parts), GFP_KERNEL);
+ if (!parts)
+ return -ENOMEM;
+
+ i = 0;
+ for_each_child_of_node(ofpart_node, pp) {
+ const __be32 *reg;
+ int len;
+ int a_cells, s_cells;
+
+ if (!dedicated && node_has_compatible(pp))
+ continue;
+
+ reg = of_get_property(pp, "reg", &len);
+ if (!reg) {
+ if (dedicated) {
+ pr_debug("%s: ofpart partition %pOF (%pOF) missing reg property.\n",
+ master->name, pp,
+ mtd_node);
+ goto ofpart_fail;
+ } else {
+ nr_parts--;
+ continue;
+ }
+ }
+
+ a_cells = of_n_addr_cells(pp);
+ s_cells = of_n_size_cells(pp);
+ if (len / 4 != a_cells + s_cells) {
+ pr_debug("%s: ofpart partition %pOF (%pOF) error parsing reg property.\n",
+ master->name, pp,
+ mtd_node);
+ goto ofpart_fail;
+ }
+
+ parts[i].offset = of_read_number(reg, a_cells);
+ parts[i].size = of_read_number(reg + a_cells, s_cells);
+ parts[i].of_node = pp;
+
+ partname = of_get_property(pp, "label", &len);
+ if (!partname)
+ partname = of_get_property(pp, "name", &len);
+ parts[i].name = partname;
+
+ if (of_get_property(pp, "read-only", &len))
+ parts[i].mask_flags |= MTD_WRITEABLE;
+
+ if (of_get_property(pp, "lock", &len))
+ parts[i].mask_flags |= MTD_POWERUP_LOCK;
+
+ if (of_property_read_bool(pp, "slc-mode"))
+ parts[i].add_flags |= MTD_SLC_ON_MLC_EMULATION;
+
+ i++;
+ }
+
+ if (!nr_parts)
+ goto ofpart_none;
+
+ if (quirks && quirks->post_parse)
+ quirks->post_parse(master, parts, nr_parts);
+
+ *pparts = parts;
+ return nr_parts;
+
+ofpart_fail:
+ pr_err("%s: error parsing ofpart partition %pOF (%pOF)\n",
+ master->name, pp, mtd_node);
+ ret = -EINVAL;
+ofpart_none:
+ of_node_put(pp);
+ kfree(parts);
+ return ret;
+}
+
+static const struct of_device_id parse_ofpart_match_table[] = {
+ /* Generic */
+ { .compatible = "fixed-partitions" },
+ /* Customized */
+ { .compatible = "brcm,bcm4908-partitions", .data = &bcm4908_partitions_quirks, },
+ {},
+};
+MODULE_DEVICE_TABLE(of, parse_ofpart_match_table);
+
+static struct mtd_part_parser ofpart_parser = {
+ .parse_fn = parse_fixed_partitions,
+ .name = "fixed-partitions",
+ .of_match_table = parse_ofpart_match_table,
+};
+
+static int parse_ofoldpart_partitions(struct mtd_info *master,
+ const struct mtd_partition **pparts,
+ struct mtd_part_parser_data *data)
+{
+ struct mtd_partition *parts;
+ struct device_node *dp;
+ int i, plen, nr_parts;
+ const struct {
+ __be32 offset, len;
+ } *part;
+ const char *names;
+
+ /* Pull of_node from the master device node */
+ dp = mtd_get_of_node(master);
+ if (!dp)
+ return 0;
+
+ part = of_get_property(dp, "partitions", &plen);
+ if (!part)
+ return 0; /* No partitions found */
+
+ pr_warn("Device tree uses obsolete partition map binding: %pOF\n", dp);
+
+ nr_parts = plen / sizeof(part[0]);
+
+ parts = kcalloc(nr_parts, sizeof(*parts), GFP_KERNEL);
+ if (!parts)
+ return -ENOMEM;
+
+ names = of_get_property(dp, "partition-names", &plen);
+
+ for (i = 0; i < nr_parts; i++) {
+ parts[i].offset = be32_to_cpu(part->offset);
+ parts[i].size = be32_to_cpu(part->len) & ~1;
+ /* bit 0 set signifies read only partition */
+ if (be32_to_cpu(part->len) & 1)
+ parts[i].mask_flags = MTD_WRITEABLE;
+
+ if (names && (plen > 0)) {
+ int len = strlen(names) + 1;
+
+ parts[i].name = names;
+ plen -= len;
+ names += len;
+ } else {
+ parts[i].name = "unnamed";
+ }
+
+ part++;
+ }
+
+ *pparts = parts;
+ return nr_parts;
+}
+
+static struct mtd_part_parser ofoldpart_parser = {
+ .parse_fn = parse_ofoldpart_partitions,
+ .name = "ofoldpart",
+};
+
+static int __init ofpart_parser_init(void)
+{
+ register_mtd_parser(&ofpart_parser);
+ register_mtd_parser(&ofoldpart_parser);
+ return 0;
+}
+
+static void __exit ofpart_parser_exit(void)
+{
+ deregister_mtd_parser(&ofpart_parser);
+ deregister_mtd_parser(&ofoldpart_parser);
+}
+
+module_init(ofpart_parser_init);
+module_exit(ofpart_parser_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Parser for MTD partitioning information in device tree");
+MODULE_AUTHOR("Vitaly Wool, David Gibson");
+/*
+ * When MTD core cannot find the requested parser, it tries to load the module
+ * with the same name. Since we provide the ofoldpart parser, we should have
+ * the corresponding alias.
+ */
+MODULE_ALIAS("fixed-partitions");
+MODULE_ALIAS("ofoldpart");

View File

@@ -1,69 +0,0 @@
From 2d751203aacf86a1b301a188d8551c7da91043ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 2 Mar 2021 20:00:12 +0100
Subject: [PATCH] mtd: parsers: ofpart: limit parsing of deprecated DT syntax
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
For backward compatibility ofpart still supports the old syntax like:
spi-flash@0 {
compatible = "jedec,spi-nor";
reg = <0x0>;
partition@0 {
label = "bootloader";
reg = <0x0 0x100000>;
};
};
(without "partitions" subnode).
There is no reason however to support nested partitions without a clear
"compatible" string like:
partitions {
compatible = "fixed-partitions";
#address-cells = <1>;
#size-cells = <1>;
partition@0 {
label = "bootloader";
reg = <0x0 0x100000>;
partition@0 {
label = "config";
reg = <0x80000 0x80000>;
};
};
};
(we never officially supported or documented that).
Make sure ofpart doesn't attempt to parse the above.
Cc: Ansuel Smith <ansuelsmth@gmail.com>
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210302190012.1255-1-zajec5@gmail.com
---
drivers/mtd/parsers/ofpart_core.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
--- a/drivers/mtd/parsers/ofpart_core.c
+++ b/drivers/mtd/parsers/ofpart_core.c
@@ -53,7 +53,7 @@ static int parse_fixed_partitions(struct
return 0;
ofpart_node = of_get_child_by_name(mtd_node, "partitions");
- if (!ofpart_node) {
+ if (!ofpart_node && !master->parent) {
/*
* We might get here even when ofpart isn't used at all (e.g.,
* when using another parser), so don't be louder than
@@ -64,6 +64,8 @@ static int parse_fixed_partitions(struct
ofpart_node = mtd_node;
dedicated = false;
}
+ if (!ofpart_node)
+ return 0;
of_id = of_match_node(parse_ofpart_match_table, ofpart_node);
if (dedicated && !of_id) {

View File

@@ -1,34 +0,0 @@
From b87b6d2d6f540e29c3f98e1572d64e560d73d6c1 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <weiyongjun1@huawei.com>
Date: Thu, 4 Mar 2021 06:46:00 +0000
Subject: [PATCH] mtd: parsers: ofpart: make symbol 'bcm4908_partitions_quirks'
static
The sparse tool complains as follows:
drivers/mtd/parsers/ofpart_core.c:25:32: warning:
symbol 'bcm4908_partitions_quirks' was not declared. Should it be static?
This symbol is not used outside of ofpart_core.c, so this
commit marks it static.
Fixes: 457da931b608 ("mtd: parsers: ofpart: support BCM4908 fixed partitions")
Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210304064600.3279138-1-weiyongjun1@huawei.com
---
drivers/mtd/parsers/ofpart_core.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/mtd/parsers/ofpart_core.c
+++ b/drivers/mtd/parsers/ofpart_core.c
@@ -22,7 +22,7 @@ struct fixed_partitions_quirks {
int (*post_parse)(struct mtd_info *mtd, struct mtd_partition *parts, int nr_parts);
};
-struct fixed_partitions_quirks bcm4908_partitions_quirks = {
+static struct fixed_partitions_quirks bcm4908_partitions_quirks = {
.post_parse = bcm4908_partitions_post_parse,
};

View File

@@ -1,38 +0,0 @@
From a5d83d6e2bc747b13f347962d4b335d70b23559b Mon Sep 17 00:00:00 2001
From: Ansuel Smith <ansuelsmth@gmail.com>
Date: Fri, 12 Mar 2021 07:28:19 +0100
Subject: [PATCH] mtd: core: add nvmem-cells compatible to parse mtd as nvmem
cells
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Partitions that contain the nvmem-cells compatible will register
their direct subnodes as nvmem cells, and the node will be treated as an
nvmem provider.
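A typical consumer then references such a cell via the standard nvmem consumer bindings; an illustrative sketch (the node and cell names here are hypothetical):

&gmac0 {
	nvmem-cells = <&macaddr_art_0>;
	nvmem-cell-names = "mac-address";
};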
Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
Tested-by: Rafał Miłecki <rafal@milecki.pl>
---
drivers/mtd/mtdcore.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -531,6 +531,7 @@ static int mtd_nvmem_reg_read(void *priv
static int mtd_nvmem_add(struct mtd_info *mtd)
{
+ struct device_node *node = mtd_get_of_node(mtd);
struct nvmem_config config = {};
config.id = -1;
@@ -543,7 +544,7 @@ static int mtd_nvmem_add(struct mtd_info
config.stride = 1;
config.read_only = true;
config.root_only = true;
- config.no_of_node = true;
+ config.no_of_node = !of_device_is_compatible(node, "nvmem-cells");
config.priv = mtd;
mtd->nvmem = nvmem_register(&config);

View File

@@ -1,25 +0,0 @@
From 42645976c3289b03a12f1bd2bc131fd98fc27170 Mon Sep 17 00:00:00 2001
From: Ansuel Smith <ansuelsmth@gmail.com>
Date: Fri, 12 Mar 2021 07:28:20 +0100
Subject: [PATCH] devicetree: nvmem: nvmem: drop $nodename restriction
Drop the $nodename restriction as an mtd partition can now also be used
as an nvmem provider.
Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
---
Documentation/devicetree/bindings/nvmem/nvmem.yaml | 3 ---
1 file changed, 3 deletions(-)
--- a/Documentation/devicetree/bindings/nvmem/nvmem.yaml
+++ b/Documentation/devicetree/bindings/nvmem/nvmem.yaml
@@ -20,9 +20,6 @@ description: |
storage device.
properties:
- $nodename:
- pattern: "^(eeprom|efuse|nvram)(@.*|-[0-9a-f])*$"
-
"#address-cells":
const: 1

View File

@@ -1,117 +0,0 @@
From 377aa0135dc8489312edd3184d143ce3a89ff7ee Mon Sep 17 00:00:00 2001
From: Ansuel Smith <ansuelsmth@gmail.com>
Date: Fri, 12 Mar 2021 07:28:21 +0100
Subject: [PATCH] dt-bindings: mtd: Document use of nvmem-cells compatible
Document the nvmem-cells compatible used to treat mtd partitions as an
nvmem provider.
Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
Reviewed-by: Rob Herring <robh@kernel.org>
---
.../bindings/mtd/partitions/nvmem-cells.yaml | 99 +++++++++++++++++++
1 file changed, 99 insertions(+)
create mode 100644 Documentation/devicetree/bindings/mtd/partitions/nvmem-cells.yaml
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/partitions/nvmem-cells.yaml
@@ -0,0 +1,99 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mtd/partitions/nvmem-cells.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Nvmem cells
+
+description: |
+ Any partition containing the compatible "nvmem-cells" will register as a
+ nvmem provider.
+ Each direct subnodes represents a nvmem cell following the nvmem binding.
+ Nvmem binding to declare nvmem-cells can be found in:
+ Documentation/devicetree/bindings/nvmem/nvmem.yaml
+
+maintainers:
+ - Ansuel Smith <ansuelsmth@gmail.com>
+
+allOf:
+ - $ref: /schemas/nvmem/nvmem.yaml#
+
+properties:
+ compatible:
+ const: nvmem-cells
+
+required:
+ - compatible
+
+additionalProperties: true
+
+examples:
+ - |
+ partitions {
+ compatible = "fixed-partitions";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ /* ... */
+
+ };
+ art: art@1200000 {
+ compatible = "nvmem-cells";
+ reg = <0x1200000 0x0140000>;
+ label = "art";
+ read-only;
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ macaddr_gmac1: macaddr_gmac1@0 {
+ reg = <0x0 0x6>;
+ };
+
+ macaddr_gmac2: macaddr_gmac2@6 {
+ reg = <0x6 0x6>;
+ };
+
+ pre_cal_24g: pre_cal_24g@1000 {
+ reg = <0x1000 0x2f20>;
+ };
+
+ pre_cal_5g: pre_cal_5g@5000{
+ reg = <0x5000 0x2f20>;
+ };
+ };
+ - |
+ partitions {
+ compatible = "fixed-partitions";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ partition@0 {
+ label = "bootloader";
+ reg = <0x000000 0x100000>;
+ read-only;
+ };
+
+ firmware@100000 {
+ compatible = "brcm,trx";
+ label = "firmware";
+ reg = <0x100000 0xe00000>;
+ };
+
+ calibration@f00000 {
+ compatible = "nvmem-cells";
+ label = "calibration";
+ reg = <0xf00000 0x100000>;
+ ranges = <0 0xf00000 0x100000>;
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ wifi0@0 {
+ reg = <0x000000 0x080000>;
+ };
+
+ wifi1@80000 {
+ reg = <0x080000 0x080000>;
+ };
+ };
+ };

View File

@@ -1,98 +0,0 @@
From 2fa7294175c76e1ec568aa75c1891fd908728c8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Fri, 12 Mar 2021 14:49:18 +0100
Subject: [PATCH] dt-bindings: mtd: add binding for Linksys Northstar
partitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Linksys on Broadcom Northstar devices uses a fixed flash layout with
multiple firmware partitions.
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210312134919.7767-1-zajec5@gmail.com
---
.../mtd/partitions/linksys,ns-partitions.yaml | 74 +++++++++++++++++++
1 file changed, 74 insertions(+)
create mode 100644 Documentation/devicetree/bindings/mtd/partitions/linksys,ns-partitions.yaml
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/partitions/linksys,ns-partitions.yaml
@@ -0,0 +1,74 @@
+# SPDX-License-Identifier: GPL-2.0-only OR BSD-2-Clause
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/mtd/partitions/linksys,ns-partitions.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Linksys Northstar partitioning
+
+description: |
+ Linksys devices based on Broadcom Northstar architecture often use two
+ firmware partitions. One is used for regular booting, the other is treated as
+ fallback.
+
+ This binding allows defining all fixed partitions and marking those containing
+ firmware. System can use that information e.g. for booting or flashing
+ purposes.
+
+maintainers:
+ - Rafał Miłecki <rafal@milecki.pl>
+
+properties:
+ compatible:
+ const: linksys,ns-partitions
+
+ "#address-cells":
+ enum: [ 1, 2 ]
+
+ "#size-cells":
+ enum: [ 1, 2 ]
+
+patternProperties:
+ "^partition@[0-9a-f]+$":
+ $ref: "partition.yaml#"
+ properties:
+ compatible:
+ items:
+ - const: linksys,ns-firmware
+ - const: brcm,trx
+ unevaluatedProperties: false
+
+required:
+ - "#address-cells"
+ - "#size-cells"
+
+additionalProperties: false
+
+examples:
+ - |
+ partitions {
+ compatible = "linksys,ns-partitions";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ partition@0 {
+ label = "boot";
+ reg = <0x0 0x100000>;
+ read-only;
+ };
+
+ partition@100000 {
+ label = "nvram";
+ reg = <0x100000 0x100000>;
+ };
+
+ partition@200000 {
+ compatible = "linksys,ns-firmware", "brcm,trx";
+ reg = <0x200000 0xf00000>;
+ };
+
+ partition@1100000 {
+ compatible = "linksys,ns-firmware", "brcm,trx";
+ reg = <0x1100000 0xf00000>;
+ };
+ };

View File

@ -1,156 +0,0 @@
From 7134a2d026d942210b4d26d6059c9d979ca7866e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Fri, 12 Mar 2021 14:49:19 +0100
Subject: [PATCH] mtd: parsers: ofpart: support Linksys Northstar partitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
This allows extending the ofpart parser with support for Linksys Northstar
devices. That support uses the recently added quirks mechanism.
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210312134919.7767-2-zajec5@gmail.com
---
drivers/mtd/parsers/Kconfig | 10 +++++
drivers/mtd/parsers/Makefile | 1 +
drivers/mtd/parsers/ofpart_core.c | 6 +++
drivers/mtd/parsers/ofpart_linksys_ns.c | 50 +++++++++++++++++++++++++
drivers/mtd/parsers/ofpart_linksys_ns.h | 18 +++++++++
5 files changed, 85 insertions(+)
create mode 100644 drivers/mtd/parsers/ofpart_linksys_ns.c
create mode 100644 drivers/mtd/parsers/ofpart_linksys_ns.h
--- a/drivers/mtd/parsers/Kconfig
+++ b/drivers/mtd/parsers/Kconfig
@@ -76,6 +76,16 @@ config MTD_OF_PARTS_BCM4908
that can have multiple "firmware" partitions. It takes care of
finding currently used one and backup ones.
+config MTD_OF_PARTS_LINKSYS_NS
+ bool "Linksys Northstar partitioning support"
+ depends on MTD_OF_PARTS && (ARCH_BCM_5301X || ARCH_BCM4908 || COMPILE_TEST)
+ default ARCH_BCM_5301X
+ help
+ This provides a partitions parser for Linksys devices based on the Broadcom
+ Northstar architecture. Linksys commonly uses a fixed flash layout with
+ two "firmware" partitions. The currently used firmware has to be detected
+ using a CFE environment variable.
+
config MTD_PARSER_IMAGETAG
tristate "Parser for BCM963XX Image Tag format partitions"
depends on BCM63XX || BMIPS_GENERIC || COMPILE_TEST
--- a/drivers/mtd/parsers/Makefile
+++ b/drivers/mtd/parsers/Makefile
@@ -6,6 +6,7 @@ obj-$(CONFIG_MTD_CMDLINE_PARTS) += cmdl
obj-$(CONFIG_MTD_OF_PARTS) += ofpart.o
ofpart-y += ofpart_core.o
ofpart-$(CONFIG_MTD_OF_PARTS_BCM4908) += ofpart_bcm4908.o
+ofpart-$(CONFIG_MTD_OF_PARTS_LINKSYS_NS)+= ofpart_linksys_ns.o
obj-$(CONFIG_MTD_PARSER_IMAGETAG) += parser_imagetag.o
obj-$(CONFIG_MTD_AFS_PARTS) += afs.o
obj-$(CONFIG_MTD_PARSER_TRX) += parser_trx.o
--- a/drivers/mtd/parsers/ofpart_core.c
+++ b/drivers/mtd/parsers/ofpart_core.c
@@ -17,6 +17,7 @@
#include <linux/mtd/partitions.h>
#include "ofpart_bcm4908.h"
+#include "ofpart_linksys_ns.h"
struct fixed_partitions_quirks {
int (*post_parse)(struct mtd_info *mtd, struct mtd_partition *parts, int nr_parts);
@@ -26,6 +27,10 @@ static struct fixed_partitions_quirks bc
.post_parse = bcm4908_partitions_post_parse,
};
+static struct fixed_partitions_quirks linksys_ns_partitions_quirks = {
+ .post_parse = linksys_ns_partitions_post_parse,
+};
+
static const struct of_device_id parse_ofpart_match_table[];
static bool node_has_compatible(struct device_node *pp)
@@ -167,6 +172,7 @@ static const struct of_device_id parse_o
{ .compatible = "fixed-partitions" },
/* Customized */
{ .compatible = "brcm,bcm4908-partitions", .data = &bcm4908_partitions_quirks, },
+ { .compatible = "linksys,ns-partitions", .data = &linksys_ns_partitions_quirks, },
{},
};
MODULE_DEVICE_TABLE(of, parse_ofpart_match_table);
--- /dev/null
+++ b/drivers/mtd/parsers/ofpart_linksys_ns.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021 Rafał Miłecki <rafal@milecki.pl>
+ */
+
+#include <linux/bcm47xx_nvram.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+
+#include "ofpart_linksys_ns.h"
+
+#define NVRAM_BOOT_PART "bootpartition"
+
+static int ofpart_linksys_ns_bootpartition(void)
+{
+ char buf[4];
+ int bootpartition;
+
+ /* Check CFE environment variable */
+ if (bcm47xx_nvram_getenv(NVRAM_BOOT_PART, buf, sizeof(buf)) > 0) {
+ if (!kstrtoint(buf, 0, &bootpartition))
+ return bootpartition;
+ pr_warn("Failed to parse %s value \"%s\"\n", NVRAM_BOOT_PART,
+ buf);
+ } else {
+ pr_warn("Failed to get NVRAM \"%s\"\n", NVRAM_BOOT_PART);
+ }
+
+ return 0;
+}
+
+int linksys_ns_partitions_post_parse(struct mtd_info *mtd,
+ struct mtd_partition *parts,
+ int nr_parts)
+{
+ int bootpartition = ofpart_linksys_ns_bootpartition();
+ int trx_idx = 0;
+ int i;
+
+ for (i = 0; i < nr_parts; i++) {
+ if (of_device_is_compatible(parts[i].of_node, "linksys,ns-firmware")) {
+ if (trx_idx++ == bootpartition)
+ parts[i].name = "firmware";
+ else
+ parts[i].name = "backup";
+ }
+ }
+
+ return 0;
+}
--- /dev/null
+++ b/drivers/mtd/parsers/ofpart_linksys_ns.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __OFPART_LINKSYS_NS_H
+#define __OFPART_LINKSYS_NS_H
+
+#ifdef CONFIG_MTD_OF_PARTS_LINKSYS_NS
+int linksys_ns_partitions_post_parse(struct mtd_info *mtd,
+ struct mtd_partition *parts,
+ int nr_parts);
+#else
+static inline int linksys_ns_partitions_post_parse(struct mtd_info *mtd,
+ struct mtd_partition *parts,
+ int nr_parts)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
+#endif
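
The quirks mechanism referenced above is the whole extension surface: a platform supplies a post_parse callback and attaches it to its compatible string through the .data pointer of the match table. A sketch of what a further platform hook would look like; the "myvendor" names are hypothetical and only illustrate the pattern:

/* Hypothetical example, not part of the patch above. */
static int myvendor_partitions_post_parse(struct mtd_info *mtd,
					  struct mtd_partition *parts,
					  int nr_parts)
{
	/* Inspect or rename parsed partitions here, the way
	 * linksys_ns_partitions_post_parse() does above. */
	return 0;
}

static struct fixed_partitions_quirks myvendor_partitions_quirks = {
	.post_parse = myvendor_partitions_post_parse,
};

/* ...plus one more entry in parse_ofpart_match_table[]:
 * { .compatible = "myvendor,partitions", .data = &myvendor_partitions_quirks, },
 */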

View File

@ -1,54 +0,0 @@
From 7e4404113686868858a34210c28ae122e967aa64 Mon Sep 17 00:00:00 2001
From: Mauri Sandberg <sandberg@mailfence.com>
Date: Tue, 9 Mar 2021 19:48:59 +0200
Subject: [PATCH] mtd: cfi_cmdset_0002: Disable buffered writes for AMD chip
0x2201
Buffer writes do not work with AMD chip 0x2201. The chip in question
is an AMD/Spansion/Cypress Semiconductor S29GL256N, and its datasheet [1]
indicates that buffered writes should be possible. While waiting for a
neater solution, resort to writing word-sized chunks only.
Without the patch, kernel logs will be flooded with entries like the ones below:
jffs2_scan_eraseblock(): End of filesystem marker found at 0x0
jffs2_build_filesystem(): unlocking the mtd device...
done.
jffs2_build_filesystem(): erasing all blocks after the end marker...
MTD do_write_buffer_wait(): software timeout, address:0x01ec000a.
jffs2: Write clean marker to block at 0x01920000 failed: -5
MTD do_write_buffer_wait(): software timeout, address:0x01e2000a.
jffs2: Write clean marker to block at 0x01880000 failed: -5
MTD do_write_buffer_wait(): software timeout, address:0x01e0000a.
jffs2: Write clean marker to block at 0x01860000 failed: -5
MTD do_write_buffer_wait(): software timeout, address:0x01dc000a.
jffs2: Write clean marker to block at 0x01820000 failed: -5
MTD do_write_buffer_wait(): software timeout, address:0x01da000a.
jffs2: Write clean marker to block at 0x01800000 failed: -5
...
Tested on a Buffalo wzr-hp-g300nh running kernel 5.10.16.
[1] https://www.cypress.com/file/219941/download
or https://datasheetspdf.com/pdf-file/565708/SPANSION/S29GL256N/1
Signed-off-by: Mauri Sandberg <sandberg@mailfence.com>
Signed-off-by: Vignesh Raghavendra <vigneshr@ti.com>
Link: https://lore.kernel.org/r/20210309174859.362060-1-sandberg@mailfence.com
---
drivers/mtd/chips/cfi_cmdset_0002.c | 4 ++++
1 file changed, 4 insertions(+)
--- a/drivers/mtd/chips/cfi_cmdset_0002.c
+++ b/drivers/mtd/chips/cfi_cmdset_0002.c
@@ -272,6 +272,10 @@ static void fixup_use_write_buffers(stru
{
struct map_info *map = mtd->priv;
struct cfi_private *cfi = map->fldrv_priv;
+
+ if (cfi->mfr == CFI_MFR_AMD && cfi->id == 0x2201)
+ return;
+
if (cfi->cfiq->BufWriteTimeoutTyp) {
pr_debug("Using buffer write method\n");
mtd->_write = cfi_amdstd_write_buffers;

View File

@ -1,32 +0,0 @@
From a4d82940ff85a7e307953dfa715f65d5ab487e10 Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Sun, 18 Apr 2021 23:46:14 +0200
Subject: dt-bindings: mtd: brcm,trx: Add brcm,trx-magic
This adds the description of an additional property which allows
specifying a custom partition parser magic to detect a TRX partition.
Buffalo has multiple devices which use the TRX format, but with
different magic values.
Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Acked-by: Rob Herring <robh@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210418214616.239574-2-hauke@hauke-m.de
---
.../devicetree/bindings/mtd/partitions/brcm,trx.txt | 5 +++++
1 file changed, 5 insertions(+)
--- a/Documentation/devicetree/bindings/mtd/partitions/brcm,trx.txt
+++ b/Documentation/devicetree/bindings/mtd/partitions/brcm,trx.txt
@@ -28,6 +28,11 @@ detected by a software parsing TRX heade
Required properties:
- compatible : (required) must be "brcm,trx"
+Optional properties:
+
+- brcm,trx-magic: TRX magic, if it is different from the default magic
+ 0x30524448 as a u32.
+
Example:
flash@0 {

View File

@ -1,50 +0,0 @@
From d7f7e04f8b67571a4bf5a0dcd4f9da4214f5262c Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Sun, 18 Apr 2021 23:46:15 +0200
Subject: mtd: parsers: trx: Allow to specify brcm,trx-magic in DT
Buffalo uses a different TRX magic for every device. To be able to use
this TRX parser, make it possible to specify the TRX magic in the device
tree. If no TRX magic is specified in the device tree, the standard value
will be used. This value should only be specified if a vendor chooses to
use a non-standard TRX magic.
Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210418214616.239574-3-hauke@hauke-m.de
---
drivers/mtd/parsers/parser_trx.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
--- a/drivers/mtd/parsers/parser_trx.c
+++ b/drivers/mtd/parsers/parser_trx.c
@@ -51,13 +51,20 @@ static int parser_trx_parse(struct mtd_i
const struct mtd_partition **pparts,
struct mtd_part_parser_data *data)
{
+ struct device_node *np = mtd_get_of_node(mtd);
struct mtd_partition *parts;
struct mtd_partition *part;
struct trx_header trx;
size_t bytes_read;
uint8_t curr_part = 0, i = 0;
+ uint32_t trx_magic = TRX_MAGIC;
int err;
+ /* Get different magic from device tree if specified */
+ err = of_property_read_u32(np, "brcm,trx-magic", &trx_magic);
+ if (err != 0 && err != -EINVAL)
+ pr_err("failed to parse \"brcm,trx-magic\" DT attribute, using default: %d\n", err);
+
parts = kcalloc(TRX_PARSER_MAX_PARTS, sizeof(struct mtd_partition),
GFP_KERNEL);
if (!parts)
@@ -70,7 +77,7 @@ static int parser_trx_parse(struct mtd_i
return err;
}
- if (trx.magic != TRX_MAGIC) {
+ if (trx.magic != trx_magic) {
kfree(parts);
return -ENOENT;
}

View File

@ -1,25 +0,0 @@
From 81bb218c829246962a6327c64eec18ddcc049936 Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Sun, 18 Apr 2021 23:46:16 +0200
Subject: mtd: parsers: trx: Allow to use TRX parser on Mediatek SoCs
Buffalo also uses the TRX partition format on Mediatek MT7622 SoCs.
Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20210418214616.239574-4-hauke@hauke-m.de
---
drivers/mtd/parsers/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/mtd/parsers/Kconfig
+++ b/drivers/mtd/parsers/Kconfig
@@ -115,7 +115,7 @@ config MTD_AFS_PARTS
config MTD_PARSER_TRX
tristate "Parser for TRX format partitions"
- depends on MTD && (BCM47XX || ARCH_BCM_5301X || COMPILE_TEST)
+ depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || COMPILE_TEST)
help
TRX is a firmware format used by Broadcom on their devices. It
may contain up to 3/4 partitions (depending on the version).

View File

@ -1,25 +0,0 @@
From dcdf415b740923530dc71d89fecc8361078473f5 Mon Sep 17 00:00:00 2001
From: Rui Salvaterra <rsalvaterra@gmail.com>
Date: Mon, 5 Apr 2021 16:11:55 +0100
Subject: [PATCH] ubifs: default to zstd compression
Compared to lzo and zlib, zstd is the best all-around performer, both in terms
of speed and compression ratio. Set it as the default, if available.
Signed-off-by: Rui Salvaterra <rsalvaterra@gmail.com>
---
fs/ubifs/sb.c | 3 +++
1 file changed, 3 insertions(+)
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -53,6 +53,9 @@
static int get_default_compressor(struct ubifs_info *c)
{
+ if (ubifs_compr_present(c, UBIFS_COMPR_ZSTD))
+ return UBIFS_COMPR_ZSTD;
+
if (ubifs_compr_present(c, UBIFS_COMPR_LZO))
return UBIFS_COMPR_LZO;

View File

@ -1,88 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 8 Feb 2021 11:34:08 -0800
Subject: [PATCH] net: extract napi poll functionality to __napi_poll()
This commit introduces a new function __napi_poll() which does the main
logic of the existing napi_poll() function, and will be called by other
functions in later commits.
This idea and implementation are by Felix Fietkau <nbd@nbd.name> and
were proposed as part of the patch to move napi work to workqueue
context.
By itself, this commit is a pure code restructure.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6805,15 +6805,10 @@ void __netif_napi_del(struct napi_struct
}
EXPORT_SYMBOL(__netif_napi_del);
-static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+static int __napi_poll(struct napi_struct *n, bool *repoll)
{
- void *have;
int work, weight;
- list_del_init(&n->poll_list);
-
- have = netpoll_poll_lock(n);
-
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
@@ -6833,7 +6828,7 @@ static int napi_poll(struct napi_struct
n->poll, work, weight);
if (likely(work < weight))
- goto out_unlock;
+ return work;
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
@@ -6842,7 +6837,7 @@ static int napi_poll(struct napi_struct
*/
if (unlikely(napi_disable_pending(n))) {
napi_complete(n);
- goto out_unlock;
+ return work;
}
if (n->gro_bitmask) {
@@ -6860,12 +6855,29 @@ static int napi_poll(struct napi_struct
if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog");
- goto out_unlock;
+ return work;
}
- list_add_tail(&n->poll_list, repoll);
+ *repoll = true;
+
+ return work;
+}
+
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+{
+ bool do_repoll = false;
+ void *have;
+ int work;
+
+ list_del_init(&n->poll_list);
+
+ have = netpoll_poll_lock(n);
+
+ work = __napi_poll(n, &do_repoll);
+
+ if (do_repoll)
+ list_add_tail(&n->poll_list, repoll);
-out_unlock:
netpoll_poll_unlock(have);
return work;

View File

@ -1,261 +0,0 @@
From: Wei Wang <weiwan@google.com>
Date: Mon, 8 Feb 2021 11:34:09 -0800
Subject: [PATCH] net: implement threaded-able napi poll loop support
This patch allows running each napi poll loop inside its own
kernel thread.
The kthread is created during netif_napi_add() if dev->threaded
is set. And threaded mode is enabled in napi_enable(). We will
provide a way to set dev->threaded and enable threaded mode
without a device up/down in the following patch.
Once that threaded mode is enabled and the kthread is
started, napi_schedule() will wake-up such thread instead
of scheduling the softirq.
The threaded poll loop behaves quite likely the net_rx_action,
but it does not have to manipulate local irqs and uses
an explicit scheduling point based on netdev_budget.
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -347,6 +347,7 @@ struct napi_struct {
struct list_head dev_list;
struct hlist_node napi_hash_node;
unsigned int napi_id;
+ struct task_struct *thread;
};
enum {
@@ -357,6 +358,7 @@ enum {
NAPI_STATE_LISTED, /* NAPI added to system lists */
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
+ NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
};
enum {
@@ -367,6 +369,7 @@ enum {
NAPIF_STATE_LISTED = BIT(NAPI_STATE_LISTED),
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
+ NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
};
enum gro_result {
@@ -497,20 +500,7 @@ static inline bool napi_complete(struct
*/
void napi_disable(struct napi_struct *n);
-/**
- * napi_enable - enable NAPI scheduling
- * @n: NAPI context
- *
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
- */
-static inline void napi_enable(struct napi_struct *n)
-{
- BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
- smp_mb__before_atomic();
- clear_bit(NAPI_STATE_SCHED, &n->state);
- clear_bit(NAPI_STATE_NPSVC, &n->state);
-}
+void napi_enable(struct napi_struct *n);
/**
* napi_synchronize - wait until NAPI is not running
@@ -1842,6 +1832,8 @@ enum netdev_ml_priv_type {
*
* @wol_enabled: Wake-on-LAN is enabled
*
+ * @threaded: napi threaded mode is enabled
+ *
* @net_notifier_list: List of per-net netdev notifier block
* that follow this device when it is moved
* to another network namespace.
@@ -2161,6 +2153,7 @@ struct net_device {
struct lock_class_key *qdisc_running_key;
bool proto_down;
unsigned wol_enabled:1;
+ unsigned threaded:1;
struct list_head net_notifier_list;
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -91,6 +91,7 @@
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/skbuff.h>
+#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
@@ -1500,6 +1501,27 @@ void netdev_notify_peers(struct net_devi
}
EXPORT_SYMBOL(netdev_notify_peers);
+static int napi_threaded_poll(void *data);
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+ int err = 0;
+
+ /* Create and wake up the kthread once to put it in
+ * TASK_INTERRUPTIBLE mode to avoid the blocked task
+ * warning and work with loadavg.
+ */
+ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+ n->dev->name, n->napi_id);
+ if (IS_ERR(n->thread)) {
+ err = PTR_ERR(n->thread);
+ pr_err("kthread_run failed with err %d\n", err);
+ n->thread = NULL;
+ }
+
+ return err;
+}
+
static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -4267,6 +4289,21 @@ int gro_normal_batch __read_mostly = 8;
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
+ struct task_struct *thread;
+
+ if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+ /* Paired with smp_mb__before_atomic() in
+ * napi_enable(). Use READ_ONCE() to guarantee
+ * a complete read on napi->thread. Only call
+ * wake_up_process() when it's not NULL.
+ */
+ thread = READ_ONCE(napi->thread);
+ if (thread) {
+ wake_up_process(thread);
+ return;
+ }
+ }
+
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
@@ -6758,6 +6795,12 @@ void netif_napi_add(struct net_device *d
set_bit(NAPI_STATE_NPSVC, &napi->state);
list_add_rcu(&napi->dev_list, &dev->napi_list);
napi_hash_add(napi);
+ /* Create kthread for this napi if dev->threaded is set.
+ * Clear dev->threaded if kthread creation failed so that
+ * threaded mode will not be enabled in napi_enable().
+ */
+ if (dev->threaded && napi_kthread_create(napi))
+ dev->threaded = 0;
}
EXPORT_SYMBOL(netif_napi_add);
@@ -6774,9 +6817,28 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
clear_bit(NAPI_STATE_DISABLE, &n->state);
+ clear_bit(NAPI_STATE_THREADED, &n->state);
}
EXPORT_SYMBOL(napi_disable);
+/**
+ * napi_enable - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Resume NAPI from being scheduled on this context.
+ * Must be paired with napi_disable.
+ */
+void napi_enable(struct napi_struct *n)
+{
+ BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
+ smp_mb__before_atomic();
+ clear_bit(NAPI_STATE_SCHED, &n->state);
+ clear_bit(NAPI_STATE_NPSVC, &n->state);
+ if (n->dev->threaded && n->thread)
+ set_bit(NAPI_STATE_THREADED, &n->state);
+}
+EXPORT_SYMBOL(napi_enable);
+
static void flush_gro_hash(struct napi_struct *napi)
{
int i;
@@ -6802,6 +6864,11 @@ void __netif_napi_del(struct napi_struct
flush_gro_hash(napi);
napi->gro_bitmask = 0;
+
+ if (napi->thread) {
+ kthread_stop(napi->thread);
+ napi->thread = NULL;
+ }
}
EXPORT_SYMBOL(__netif_napi_del);
@@ -6883,6 +6950,51 @@ static int napi_poll(struct napi_struct
return work;
}
+static int napi_thread_wait(struct napi_struct *napi)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop() && !napi_disable_pending(napi)) {
+ if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+ WARN_ON(!list_empty(&napi->poll_list));
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+ return -1;
+}
+
+static int napi_threaded_poll(void *data)
+{
+ struct napi_struct *napi = data;
+ void *have;
+
+ while (!napi_thread_wait(napi)) {
+ for (;;) {
+ bool repoll = false;
+
+ local_bh_disable();
+
+ have = netpoll_poll_lock(napi);
+ __napi_poll(napi, &repoll);
+ netpoll_poll_unlock(have);
+
+ __kfree_skb_flush();
+ local_bh_enable();
+
+ if (!repoll)
+ break;
+
+ cond_resched();
+ }
+ }
+ return 0;
+}
+
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
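
With this patch alone, a driver opts in by setting dev->threaded before netif_napi_add(), so the kthread is created there and napi_enable() then sets NAPI_STATE_THREADED. A minimal sketch of the driver side; the "foo" names are hypothetical:

#include <linux/netdevice.h>

static int foo_poll(struct napi_struct *napi, int budget); /* usual NAPI poll */

static void foo_setup_rx_napi(struct net_device *dev, struct napi_struct *napi)
{
	dev->threaded = 1;	/* request a kthread for each napi */
	netif_napi_add(dev, napi, foo_poll, NAPI_POLL_WEIGHT);
	napi_enable(napi);	/* sets NAPI_STATE_THREADED if the kthread exists */
}

If kthread creation fails in netif_napi_add(), dev->threaded is cleared again and the napi silently falls back to softirq mode.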

View File

@ -1,177 +0,0 @@
From: Wei Wang <weiwan@google.com>
Date: Mon, 8 Feb 2021 11:34:10 -0800
Subject: [PATCH] net: add sysfs attribute to control napi threaded mode
This patch adds a new sysfs attribute to the network device class.
Said attribute provides a per-device control to enable/disable the
threaded mode for all the napi instances of the given network device,
without the need for a device up/down.
The user sets it to 1 or 0 to enable or disable threaded mode.
Note: when switching a napi instance between threaded and the current
softirq-based mode, the change will not take effect immediately if the
napi is currently being polled. The mode switch will happen the next
time napi_schedule() is called.
Co-developed-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Co-developed-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
Co-developed-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Wei Wang <weiwan@google.com>
Reviewed-by: Alexander Duyck <alexanderduyck@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -337,3 +337,18 @@ Contact: netdev@vger.kernel.org
Description:
32-bit unsigned integer counting the number of times the link has
been down
+
+What: /sys/class/net/<iface>/threaded
+Date: Jan 2021
+KernelVersion: 5.12
+Contact: netdev@vger.kernel.org
+Description:
+ Boolean value to control the threaded mode per device. The user can
+ set this value to enable/disable threaded mode for all napi instances
+ belonging to this device, without the need to do a device up/down.
+
+ Possible values:
+ == ==================================
+ 0 threaded mode disabled for this dev
+ 1 threaded mode enabled for this dev
+ == ==================================
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -491,6 +491,8 @@ static inline bool napi_complete(struct
return napi_complete_done(n, 0);
}
+int dev_set_threaded(struct net_device *dev, bool threaded);
+
/**
* napi_disable - prevent NAPI from scheduling
* @n: NAPI context
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4293,8 +4293,9 @@ static inline void ____napi_schedule(str
if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
/* Paired with smp_mb__before_atomic() in
- * napi_enable(). Use READ_ONCE() to guarantee
- * a complete read on napi->thread. Only call
+ * napi_enable()/dev_set_threaded().
+ * Use READ_ONCE() to guarantee a complete
+ * read on napi->thread. Only call
* wake_up_process() when it's not NULL.
*/
thread = READ_ONCE(napi->thread);
@@ -6768,6 +6769,49 @@ static void init_gro_hash(struct napi_st
napi->gro_bitmask = 0;
}
+int dev_set_threaded(struct net_device *dev, bool threaded)
+{
+ struct napi_struct *napi;
+ int err = 0;
+
+ if (dev->threaded == threaded)
+ return 0;
+
+ if (threaded) {
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (!napi->thread) {
+ err = napi_kthread_create(napi);
+ if (err) {
+ threaded = false;
+ break;
+ }
+ }
+ }
+ }
+
+ dev->threaded = threaded;
+
+ /* Make sure kthread is created before THREADED bit
+ * is set.
+ */
+ smp_mb__before_atomic();
+
+ /* Setting/unsetting threaded mode on a napi might not immediately
+ * take effect, if the current napi instance is actively being
+ * polled. In this case, the switch between threaded mode and
+ * softirq mode will happen in the next round of napi_schedule().
+ * This should not cause hiccups/stalls to the live traffic.
+ */
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (threaded)
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+ else
+ clear_bit(NAPI_STATE_THREADED, &napi->state);
+ }
+
+ return err;
+}
+
void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -587,6 +587,45 @@ static ssize_t phys_switch_id_show(struc
}
static DEVICE_ATTR_RO(phys_switch_id);
+static ssize_t threaded_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (dev_isalive(netdev))
+ ret = sprintf(buf, fmt_dec, netdev->threaded);
+
+ rtnl_unlock();
+ return ret;
+}
+
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ int ret;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ if (val != 0 && val != 1)
+ return -EOPNOTSUPP;
+
+ ret = dev_set_threaded(dev, val);
+
+ return ret;
+}
+
+static ssize_t threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+}
+static DEVICE_ATTR_RW(threaded);
+
static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
@@ -619,6 +658,7 @@ static struct attribute *net_class_attrs
&dev_attr_proto_down.attr,
&dev_attr_carrier_up_count.attr,
&dev_attr_carrier_down_count.attr,
+ &dev_attr_threaded.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
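
From userspace the new attribute is an ordinary boolean sysfs file. A small sketch of toggling it; the interface name "eth0" is only an example:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Enable threaded NAPI for all napi instances of eth0. */
	int fd = open("/sys/class/net/eth0/threaded", O_WRONLY);

	if (fd < 0 || write(fd, "1", 1) != 1) {
		perror("threaded");
		return 1;
	}
	close(fd);
	return 0;
}

Per modify_napi_threaded() above, writing any value other than 0 or 1, or writing before any napi instance is registered, fails with -EOPNOTSUPP.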

View File

@ -1,93 +0,0 @@
From: Wei Wang <weiwan@google.com>
Date: Mon, 1 Mar 2021 17:21:13 -0800
Subject: [PATCH] net: fix race between napi kthread mode and busy poll
Currently, napi_thread_wait() checks the NAPI_STATE_SCHED bit to
determine if the kthread owns this napi and can call napi->poll() on
it. However, if socket busy poll is enabled, it is possible that the
busy poll thread grabs this SCHED bit (after the previous napi->poll()
invokes napi_complete_done() and clears the SCHED bit) and tries to poll
on the same napi. napi_disable() could grab the SCHED bit as well.
This patch tries to fix this race by adding a new bit
NAPI_STATE_SCHED_THREADED to napi->state. This bit gets set in
____napi_schedule() if threaded mode is enabled, gets cleared in
napi_complete_done(), and we only poll the napi in the kthread if this
bit is set. This helps distinguish the ownership of the napi between
the kthread and other scenarios, and fixes the race.
Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support")
Reported-by: Martin Zaharinov <micron10@gmail.com>
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Wei Wang <weiwan@google.com>
Cc: Alexander Duyck <alexanderduyck@fb.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
---
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -359,6 +359,7 @@ enum {
NAPI_STATE_NO_BUSY_POLL,/* Do not add in napi_hash, no busy polling */
NAPI_STATE_IN_BUSY_POLL,/* sk_busy_loop() owns this NAPI */
NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/
+ NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */
};
enum {
@@ -370,6 +371,7 @@ enum {
NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL),
NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL),
NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED),
+ NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED),
};
enum gro_result {
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4300,6 +4300,8 @@ static inline void ____napi_schedule(str
*/
thread = READ_ONCE(napi->thread);
if (thread) {
+ if (thread->state != TASK_INTERRUPTIBLE)
+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
wake_up_process(thread);
return;
}
@@ -6560,7 +6562,8 @@ bool napi_complete_done(struct napi_stru
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
- new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+ NAPIF_STATE_SCHED_THREADED);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
@@ -6996,16 +6999,25 @@ static int napi_poll(struct napi_struct
static int napi_thread_wait(struct napi_struct *napi)
{
+ bool woken = false;
+
set_current_state(TASK_INTERRUPTIBLE);
while (!kthread_should_stop() && !napi_disable_pending(napi)) {
- if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+ /* Testing SCHED_THREADED bit here to make sure the current
+ * kthread owns this napi and could poll on this napi.
+ * Testing SCHED bit is not enough because SCHED bit might be
+ * set by some other busy poll thread or by napi_disable().
+ */
+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
WARN_ON(!list_empty(&napi->poll_list));
__set_current_state(TASK_RUNNING);
return 0;
}
schedule();
+ /* woken being true indicates this thread owns this napi. */
+ woken = true;
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);

View File

@ -1,53 +0,0 @@
From: Paolo Abeni <pabeni@redhat.com>
Date: Fri, 9 Apr 2021 17:24:17 +0200
Subject: [PATCH] net: fix hangup on napi_disable for threaded napi
napi_disable() is subject to a hangup when threaded
mode is enabled and the napi is under heavy traffic.
If the relevant napi has been scheduled and napi_disable()
kicks in before the next napi_thread_wait() completes - so
that the latter quits due to the napi_disable_pending() condition -
the existing code leaves the NAPI_STATE_SCHED bit set, and the
napi_disable() loop waiting for that bit will hang.
This patch addresses the issue by dropping the NAPI_STATE_DISABLE
bit test in napi_thread_wait(). The later napi_threaded_poll()
iteration will take care of clearing NAPI_STATE_SCHED.
This also addresses a related problem reported by Jakub:
before this patch, a napi_disable()/napi_enable() pair killed
the napi thread, effectively disabling threaded mode.
On the patched kernel, napi_disable() simply stops scheduling
the relevant thread.
v1 -> v2:
- let the main napi_thread_poll() loop clear the SCHED bit
Reported-by: Jakub Kicinski <kuba@kernel.org>
Fixes: 29863d41bb6e ("net: implement threaded-able napi poll loop support")
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://lore.kernel.org/r/883923fa22745a9589e8610962b7dc59df09fb1f.1617981844.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -7003,7 +7003,7 @@ static int napi_thread_wait(struct napi_
set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop() && !napi_disable_pending(napi)) {
+ while (!kthread_should_stop()) {
/* Testing SCHED_THREADED bit here to make sure the current
* kthread owns this napi and could poll on this napi.
* Testing SCHED bit is not enough because SCHED bit might be
@@ -7021,6 +7021,7 @@ static int napi_thread_wait(struct napi_
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
+
return -1;
}

View File

@ -1,52 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Fri, 20 Nov 2020 13:49:13 +0100
Subject: [PATCH] netfilter: flowtable: add hash offset field to tuple
Add a placeholder field to calculate the hash tuple offset, similar to
commit 2c407aca6497 ("netfilter: conntrack: avoid gcc-10 zero-length-bounds
warning").
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -107,6 +107,10 @@ struct flow_offload_tuple {
u8 l3proto;
u8 l4proto;
+
+ /* All members above are keys for lookups, see flow_offload_hash(). */
+ struct { } __hash;
+
u8 dir;
u16 mtu;
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -191,14 +191,14 @@ static u32 flow_offload_hash(const void
{
const struct flow_offload_tuple *tuple = data;
- return jhash(tuple, offsetof(struct flow_offload_tuple, dir), seed);
+ return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}
static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
const struct flow_offload_tuple_rhash *tuplehash = data;
- return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, dir), seed);
+ return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}
static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
@@ -207,7 +207,7 @@ static int flow_offload_hash_cmp(struct
const struct flow_offload_tuple *tuple = arg->key;
const struct flow_offload_tuple_rhash *x = ptr;
- if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, dir)))
+ if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
return 1;
return 0;
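
The empty struct acts as a position marker: everything declared before it is key material, everything after it is metadata, and offsetof() on the marker yields the key length without hardcoding a member name such as dir. A standalone sketch of the idiom (it relies on the zero-size struct GNU C extension, as the kernel code does):

#include <stddef.h>
#include <string.h>

struct lookup_key {
	unsigned int	addr;
	unsigned short	port;
	unsigned char	proto;

	/* All members above are part of the hashed/compared key. */
	struct { }	__hash;

	unsigned char	dir;	/* metadata, excluded from the key */
};

static int key_equal(const struct lookup_key *a, const struct lookup_key *b)
{
	/* Compare only up to the marker, never the trailing metadata. */
	return !memcmp(a, b, offsetof(struct lookup_key, __hash));
}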

View File

@ -1,98 +0,0 @@
From: Oz Shlomo <ozsh@nvidia.com>
Date: Tue, 23 Mar 2021 00:56:19 +0100
Subject: [PATCH] netfilter: flowtable: separate replace, destroy and
stats to different workqueues
Currently the flow table offload replace, destroy and stats work items are
executed on a single workqueue. As such, DESTROY and STATS commands may
be backlogged after a burst of REPLACE work items. This scenario can bloat
up memory and may cause active connections to age out.
Instantiate add, del and stats workqueues to avoid backlogs of non-dependent
actions. Provide sysfs control over the workqueue attributes, allowing
userspace applications to control the workqueue cpumask.
Signed-off-by: Oz Shlomo <ozsh@nvidia.com>
Reviewed-by: Paul Blakey <paulb@nvidia.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -13,7 +13,9 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_tuple.h>
-static struct workqueue_struct *nf_flow_offload_wq;
+static struct workqueue_struct *nf_flow_offload_add_wq;
+static struct workqueue_struct *nf_flow_offload_del_wq;
+static struct workqueue_struct *nf_flow_offload_stats_wq;
struct flow_offload_work {
struct list_head list;
@@ -827,7 +829,12 @@ static void flow_offload_work_handler(st
static void flow_offload_queue_work(struct flow_offload_work *offload)
{
- queue_work(nf_flow_offload_wq, &offload->work);
+ if (offload->cmd == FLOW_CLS_REPLACE)
+ queue_work(nf_flow_offload_add_wq, &offload->work);
+ else if (offload->cmd == FLOW_CLS_DESTROY)
+ queue_work(nf_flow_offload_del_wq, &offload->work);
+ else
+ queue_work(nf_flow_offload_stats_wq, &offload->work);
}
static struct flow_offload_work *
@@ -899,8 +906,11 @@ void nf_flow_offload_stats(struct nf_flo
void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
{
- if (nf_flowtable_hw_offload(flowtable))
- flush_workqueue(nf_flow_offload_wq);
+ if (nf_flowtable_hw_offload(flowtable)) {
+ flush_workqueue(nf_flow_offload_add_wq);
+ flush_workqueue(nf_flow_offload_del_wq);
+ flush_workqueue(nf_flow_offload_stats_wq);
+ }
}
static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
@@ -1013,15 +1023,33 @@ EXPORT_SYMBOL_GPL(nf_flow_table_offload_
int nf_flow_table_offload_init(void)
{
- nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload",
- WQ_UNBOUND, 0);
- if (!nf_flow_offload_wq)
+ nf_flow_offload_add_wq = alloc_workqueue("nf_ft_offload_add",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_add_wq)
return -ENOMEM;
+ nf_flow_offload_del_wq = alloc_workqueue("nf_ft_offload_del",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_del_wq)
+ goto err_del_wq;
+
+ nf_flow_offload_stats_wq = alloc_workqueue("nf_ft_offload_stats",
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!nf_flow_offload_stats_wq)
+ goto err_stats_wq;
+
return 0;
+
+err_stats_wq:
+ destroy_workqueue(nf_flow_offload_del_wq);
+err_del_wq:
+ destroy_workqueue(nf_flow_offload_add_wq);
+ return -ENOMEM;
}
void nf_flow_table_offload_exit(void)
{
- destroy_workqueue(nf_flow_offload_wq);
+ destroy_workqueue(nf_flow_offload_add_wq);
+ destroy_workqueue(nf_flow_offload_del_wq);
+ destroy_workqueue(nf_flow_offload_stats_wq);
}
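
Since the three workqueues are allocated with WQ_SYSFS, they appear under /sys/devices/virtual/workqueue/ where attributes such as cpumask can be tuned. A sketch of pinning the add workqueue to CPUs 0-1 from userspace; the mask value is just an example:

#include <stdio.h>

int main(void)
{
	/* 0x3 selects CPUs 0 and 1; adjust for the target system. */
	FILE *f = fopen("/sys/devices/virtual/workqueue/nf_ft_offload_add/cpumask", "w");

	if (!f) {
		perror("cpumask");
		return 1;
	}
	fprintf(f, "3\n");
	return fclose(f) ? 1 : 0;
}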

View File

@ -1,22 +0,0 @@
From: YueHaibing <yuehaibing@huawei.com>
Date: Tue, 23 Mar 2021 00:56:21 +0100
Subject: [PATCH] netfilter: conntrack: Remove unused variable
declaration
commit e97c3e278e95 ("tproxy: split off ipv6 defragmentation to a separate
module") left this behind.
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h
@@ -4,7 +4,4 @@
extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6;
-#include <linux/sysctl.h>
-extern struct ctl_table nf_ct_ipv6_sysctl_table[];
-
#endif /* _NF_CONNTRACK_IPV6_H*/

View File

@ -1,291 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 23 Mar 2021 00:56:22 +0100
Subject: [PATCH] netfilter: flowtable: consolidate
skb_try_make_writable() call
Fetch the layer 4 header size to be mangled by NAT when building the
tuple, then use it to make the network and transport headers writable.
After this update, the NAT routines assume that the skbuff
area is writable. Refetch the pointers only after the single
skb_try_make_writable() call.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -394,9 +394,6 @@ static int nf_flow_nat_port_tcp(struct s
{
struct tcphdr *tcph;
- if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
@@ -408,9 +405,6 @@ static int nf_flow_nat_port_udp(struct s
{
struct udphdr *udph;
- if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace2(&udph->check, skb, port,
@@ -446,9 +440,6 @@ int nf_flow_snat_port(const struct flow_
struct flow_ports *hdr;
__be16 port, new_port;
- if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
@@ -477,9 +468,6 @@ int nf_flow_dnat_port(const struct flow_
struct flow_ports *hdr;
__be16 port, new_port;
- if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
- return -1;
-
hdr = (void *)(skb_network_header(skb) + thoff);
switch (dir) {
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -39,9 +39,6 @@ static int nf_flow_nat_ip_tcp(struct sk_
{
struct tcphdr *tcph;
- if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
@@ -53,9 +50,6 @@ static int nf_flow_nat_ip_udp(struct sk_
{
struct udphdr *udph;
- if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace4(&udph->check, skb, addr,
@@ -136,19 +130,17 @@ static int nf_flow_dnat_ip(const struct
}
static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- unsigned int thoff, enum flow_offload_tuple_dir dir)
+ unsigned int thoff, enum flow_offload_tuple_dir dir,
+ struct iphdr *iph)
{
- struct iphdr *iph = ip_hdr(skb);
-
if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
(nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
+ nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
return -1;
- iph = ip_hdr(skb);
if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
(nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, ip_hdr(skb), thoff, dir) < 0))
+ nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
return -1;
return 0;
@@ -160,10 +152,10 @@ static bool ip_has_options(unsigned int
}
static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
+ struct flow_offload_tuple *tuple, u32 *hdrsize)
{
- unsigned int thoff, hdrsize;
struct flow_ports *ports;
+ unsigned int thoff;
struct iphdr *iph;
if (!pskb_may_pull(skb, sizeof(*iph)))
@@ -178,10 +170,10 @@ static int nf_flow_tuple_ip(struct sk_bu
switch (iph->protocol) {
case IPPROTO_TCP:
- hdrsize = sizeof(struct tcphdr);
+ *hdrsize = sizeof(struct tcphdr);
break;
case IPPROTO_UDP:
- hdrsize = sizeof(struct udphdr);
+ *hdrsize = sizeof(struct udphdr);
break;
default:
return -1;
@@ -191,7 +183,7 @@ static int nf_flow_tuple_ip(struct sk_bu
return -1;
thoff = iph->ihl * 4;
- if (!pskb_may_pull(skb, thoff + hdrsize))
+ if (!pskb_may_pull(skb, thoff + *hdrsize))
return -1;
iph = ip_hdr(skb);
@@ -252,11 +244,12 @@ nf_flow_offload_ip_hook(void *priv, stru
unsigned int thoff;
struct iphdr *iph;
__be32 nexthop;
+ u32 hdrsize;
if (skb->protocol != htons(ETH_P_IP))
return NF_ACCEPT;
- if (nf_flow_tuple_ip(skb, state->in, &tuple) < 0)
+ if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize) < 0)
return NF_ACCEPT;
tuplehash = flow_offload_lookup(flow_table, &tuple);
@@ -271,11 +264,13 @@ nf_flow_offload_ip_hook(void *priv, stru
if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
return NF_ACCEPT;
- if (skb_try_make_writable(skb, sizeof(*iph)))
+ iph = ip_hdr(skb);
+ thoff = iph->ihl * 4;
+ if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
- thoff = ip_hdr(skb)->ihl * 4;
- if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
+ iph = ip_hdr(skb);
+ if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
return NF_ACCEPT;
flow_offload_refresh(flow_table, flow);
@@ -285,10 +280,9 @@ nf_flow_offload_ip_hook(void *priv, stru
return NF_ACCEPT;
}
- if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
+ if (nf_flow_nat_ip(flow, skb, thoff, dir, iph) < 0)
return NF_DROP;
- iph = ip_hdr(skb);
ip_decrease_ttl(iph);
skb->tstamp = 0;
@@ -317,9 +311,6 @@ static int nf_flow_nat_ipv6_tcp(struct s
{
struct tcphdr *tcph;
- if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
- return -1;
-
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
new_addr->s6_addr32, true);
@@ -333,9 +324,6 @@ static int nf_flow_nat_ipv6_udp(struct s
{
struct udphdr *udph;
- if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
- return -1;
-
udph = (void *)(skb_network_header(skb) + thoff);
if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32,
@@ -417,31 +405,30 @@ static int nf_flow_dnat_ipv6(const struc
static int nf_flow_nat_ipv6(const struct flow_offload *flow,
struct sk_buff *skb,
- enum flow_offload_tuple_dir dir)
+ enum flow_offload_tuple_dir dir,
+ struct ipv6hdr *ip6h)
{
- struct ipv6hdr *ip6h = ipv6_hdr(skb);
unsigned int thoff = sizeof(*ip6h);
if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
(nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_snat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
+ nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
return -1;
- ip6h = ipv6_hdr(skb);
if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
(nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_dnat_ipv6(flow, skb, ipv6_hdr(skb), thoff, dir) < 0))
+ nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
return -1;
return 0;
}
static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple)
+ struct flow_offload_tuple *tuple, u32 *hdrsize)
{
- unsigned int thoff, hdrsize;
struct flow_ports *ports;
struct ipv6hdr *ip6h;
+ unsigned int thoff;
if (!pskb_may_pull(skb, sizeof(*ip6h)))
return -1;
@@ -450,10 +437,10 @@ static int nf_flow_tuple_ipv6(struct sk_
switch (ip6h->nexthdr) {
case IPPROTO_TCP:
- hdrsize = sizeof(struct tcphdr);
+ *hdrsize = sizeof(struct tcphdr);
break;
case IPPROTO_UDP:
- hdrsize = sizeof(struct udphdr);
+ *hdrsize = sizeof(struct udphdr);
break;
default:
return -1;
@@ -463,7 +450,7 @@ static int nf_flow_tuple_ipv6(struct sk_
return -1;
thoff = sizeof(*ip6h);
- if (!pskb_may_pull(skb, thoff + hdrsize))
+ if (!pskb_may_pull(skb, thoff + *hdrsize))
return -1;
ip6h = ipv6_hdr(skb);
@@ -493,11 +480,12 @@ nf_flow_offload_ipv6_hook(void *priv, st
struct net_device *outdev;
struct ipv6hdr *ip6h;
struct rt6_info *rt;
+ u32 hdrsize;
if (skb->protocol != htons(ETH_P_IPV6))
return NF_ACCEPT;
- if (nf_flow_tuple_ipv6(skb, state->in, &tuple) < 0)
+ if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize) < 0)
return NF_ACCEPT;
tuplehash = flow_offload_lookup(flow_table, &tuple);
@@ -523,13 +511,13 @@ nf_flow_offload_ipv6_hook(void *priv, st
return NF_ACCEPT;
}
- if (skb_try_make_writable(skb, sizeof(*ip6h)))
+ if (skb_try_make_writable(skb, sizeof(*ip6h) + hdrsize))
return NF_DROP;
- if (nf_flow_nat_ipv6(flow, skb, dir) < 0)
+ ip6h = ipv6_hdr(skb);
+ if (nf_flow_nat_ipv6(flow, skb, dir, ip6h) < 0)
return NF_DROP;
- ip6h = ipv6_hdr(skb);
ip6h->hop_limit--;
skb->tstamp = 0;
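
The pattern the datapath converges on can be summarized in a short hedged sketch (hypothetical helper, IPv4 case): validate lengths with pskb_may_pull() while building the tuple, make the packet writable once before mangling, and refetch header pointers afterwards, because skb_try_make_writable() may reallocate the header area.

#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/types.h>

/* Sketch only: mangle an IPv4 packet after a single writable call. */
static int example_mangle(struct sk_buff *skb, unsigned int thoff, u32 hdrsize)
{
	struct iphdr *iph;

	if (skb_try_make_writable(skb, thoff + hdrsize))
		return -1;

	iph = ip_hdr(skb);	/* refetch: the header may have moved */
	/* ...NAT mangling of iph and the L4 header goes here... */
	return 0;
}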

View File

@ -1,35 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 23 Mar 2021 00:56:23 +0100
Subject: [PATCH] netfilter: flowtable: move skb_try_make_writable()
before NAT in IPv4
For consistency with the IPv6 flowtable datapath and to make sure the
skbuff is writable right before the NAT header updates.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -266,10 +266,6 @@ nf_flow_offload_ip_hook(void *priv, stru
iph = ip_hdr(skb);
thoff = iph->ihl * 4;
- if (skb_try_make_writable(skb, thoff + hdrsize))
- return NF_DROP;
-
- iph = ip_hdr(skb);
if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
return NF_ACCEPT;
@@ -280,6 +276,10 @@ nf_flow_offload_ip_hook(void *priv, stru
return NF_ACCEPT;
}
+ if (skb_try_make_writable(skb, thoff + hdrsize))
+ return NF_DROP;
+
+ iph = ip_hdr(skb);
if (nf_flow_nat_ip(flow, skb, thoff, dir, iph) < 0)
return NF_DROP;

View File

@ -1,82 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 23 Mar 2021 00:56:24 +0100
Subject: [PATCH] netfilter: flowtable: move FLOW_OFFLOAD_DIR_MAX away
from enumeration
This allows removing the default case, which should never happen and
was only added to avoid gcc warnings on the unhandled
FLOW_OFFLOAD_DIR_MAX enumeration case.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -86,8 +86,8 @@ static inline bool nf_flowtable_hw_offlo
enum flow_offload_tuple_dir {
FLOW_OFFLOAD_DIR_ORIGINAL = IP_CT_DIR_ORIGINAL,
FLOW_OFFLOAD_DIR_REPLY = IP_CT_DIR_REPLY,
- FLOW_OFFLOAD_DIR_MAX = IP_CT_DIR_MAX
};
+#define FLOW_OFFLOAD_DIR_MAX IP_CT_DIR_MAX
struct flow_offload_tuple {
union {
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -453,8 +453,6 @@ int nf_flow_snat_port(const struct flow_
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
hdr->dest = new_port;
break;
- default:
- return -1;
}
return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
@@ -481,8 +479,6 @@ int nf_flow_dnat_port(const struct flow_
new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
hdr->source = new_port;
break;
- default:
- return -1;
}
return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -96,8 +96,6 @@ static int nf_flow_snat_ip(const struct
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr;
iph->daddr = new_addr;
break;
- default:
- return -1;
}
csum_replace4(&iph->check, addr, new_addr);
@@ -121,8 +119,6 @@ static int nf_flow_dnat_ip(const struct
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr;
iph->saddr = new_addr;
break;
- default:
- return -1;
}
csum_replace4(&iph->check, addr, new_addr);
@@ -371,8 +367,6 @@ static int nf_flow_snat_ipv6(const struc
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6;
ip6h->daddr = new_addr;
break;
- default:
- return -1;
}
return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
@@ -396,8 +390,6 @@ static int nf_flow_dnat_ipv6(const struc
new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6;
ip6h->saddr = new_addr;
break;
- default:
- return -1;
}
return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
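
The reasoning is a compiler one: with FLOW_OFFLOAD_DIR_MAX out of the enum, -Wswitch only has to account for values that can actually occur, so a switch over both directions is exhaustive and the dead default branches can go. A standalone sketch of the effect:

enum dir {
	DIR_ORIGINAL,
	DIR_REPLY,
};
#define DIR_MAX 2	/* sizing constant, no longer an enumerator */

static const char *dir_name(enum dir d)
{
	/* Every enumerator is handled, so -Wswitch stays quiet
	 * without a default case. */
	switch (d) {
	case DIR_ORIGINAL:
		return "original";
	case DIR_REPLY:
		return "reply";
	}
	return "unreachable";
}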

View File

@ -1,394 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 23 Mar 2021 00:56:25 +0100
Subject: [PATCH] netfilter: flowtable: fast NAT functions never fail
Simplify the existing fast NAT routines by returning void. After the
skb_try_make_writable() call consolidation, these routines can no
longer fail.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -228,12 +228,12 @@ void nf_flow_table_free(struct nf_flowta
void flow_offload_teardown(struct flow_offload *flow);
-int nf_flow_snat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir);
-int nf_flow_dnat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir);
+void nf_flow_snat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir);
+void nf_flow_dnat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir);
struct flow_ports {
__be16 source, dest;
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -388,20 +388,17 @@ static void nf_flow_offload_work_gc(stru
queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}
-
-static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct tcphdr *tcph;
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
-
- return 0;
}
-static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
- __be16 port, __be16 new_port)
+static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
+ __be16 port, __be16 new_port)
{
struct udphdr *udph;
@@ -412,30 +409,24 @@ static int nf_flow_nat_port_udp(struct s
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
- u8 protocol, __be16 port, __be16 new_port)
+static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, __be16 port, __be16 new_port)
{
switch (protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_tcp(skb, thoff, port, new_port);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
- return NF_DROP;
+ nf_flow_nat_port_udp(skb, thoff, port, new_port);
break;
}
-
- return 0;
}
-int nf_flow_snat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_snat_port(const struct flow_offload *flow,
+ struct sk_buff *skb, unsigned int thoff,
+ u8 protocol, enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
@@ -455,13 +446,13 @@ int nf_flow_snat_port(const struct flow_
break;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);
-int nf_flow_dnat_port(const struct flow_offload *flow,
- struct sk_buff *skb, unsigned int thoff,
- u8 protocol, enum flow_offload_tuple_dir dir)
+void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
+ unsigned int thoff, u8 protocol,
+ enum flow_offload_tuple_dir dir)
{
struct flow_ports *hdr;
__be16 port, new_port;
@@ -481,7 +472,7 @@ int nf_flow_dnat_port(const struct flow_
break;
}
- return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
+ nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -34,19 +34,17 @@ static int nf_flow_state_check(struct fl
return 0;
}
-static int nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
+static void nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
{
struct tcphdr *tcph;
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true);
-
- return 0;
}
-static int nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
- __be32 addr, __be32 new_addr)
+static void nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff,
+ __be32 addr, __be32 new_addr)
{
struct udphdr *udph;
@@ -57,31 +55,25 @@ static int nf_flow_nat_ip_udp(struct sk_
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
- unsigned int thoff, __be32 addr,
- __be32 new_addr)
+static void nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, __be32 addr,
+ __be32 new_addr)
{
switch (iph->protocol) {
case IPPROTO_TCP:
- if (nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_ip_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ip_udp(skb, thoff, addr, new_addr);
break;
}
-
- return 0;
}
-static int nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_snat_ip(const struct flow_offload *flow,
+ struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
{
__be32 addr, new_addr;
@@ -99,12 +91,12 @@ static int nf_flow_snat_ip(const struct
}
csum_replace4(&iph->check, addr, new_addr);
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+ nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
}
-static int nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb,
- struct iphdr *iph, unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_dnat_ip(const struct flow_offload *flow,
+ struct sk_buff *skb, struct iphdr *iph,
+ unsigned int thoff, enum flow_offload_tuple_dir dir)
{
__be32 addr, new_addr;
@@ -122,24 +114,21 @@ static int nf_flow_dnat_ip(const struct
}
csum_replace4(&iph->check, addr, new_addr);
- return nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
+ nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr);
}
-static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
+static void nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
unsigned int thoff, enum flow_offload_tuple_dir dir,
struct iphdr *iph)
{
- if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
- (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
-
- if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
- (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
- nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
- return -1;
-
- return 0;
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir);
+ nf_flow_snat_ip(flow, skb, iph, thoff, dir);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir);
+ nf_flow_dnat_ip(flow, skb, iph, thoff, dir);
+ }
}
static bool ip_has_options(unsigned int thoff)
@@ -276,8 +265,7 @@ nf_flow_offload_ip_hook(void *priv, stru
return NF_DROP;
iph = ip_hdr(skb);
- if (nf_flow_nat_ip(flow, skb, thoff, dir, iph) < 0)
- return NF_DROP;
+ nf_flow_nat_ip(flow, skb, thoff, dir, iph);
ip_decrease_ttl(iph);
skb->tstamp = 0;
@@ -301,22 +289,21 @@ nf_flow_offload_ip_hook(void *priv, stru
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
-static int nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr,
+ struct ipv6hdr *ip6h)
{
struct tcphdr *tcph;
tcph = (void *)(skb_network_header(skb) + thoff);
inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32,
new_addr->s6_addr32, true);
-
- return 0;
}
-static int nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
- struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff,
+ struct in6_addr *addr,
+ struct in6_addr *new_addr)
{
struct udphdr *udph;
@@ -327,32 +314,26 @@ static int nf_flow_nat_ipv6_udp(struct s
if (!udph->check)
udph->check = CSUM_MANGLED_0;
}
-
- return 0;
}
-static int nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff, struct in6_addr *addr,
- struct in6_addr *new_addr)
+static void nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff, struct in6_addr *addr,
+ struct in6_addr *new_addr)
{
switch (ip6h->nexthdr) {
case IPPROTO_TCP:
- if (nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr, ip6h);
break;
case IPPROTO_UDP:
- if (nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr);
break;
}
-
- return 0;
}
-static int nf_flow_snat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_snat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
{
struct in6_addr addr, new_addr;
@@ -369,13 +350,13 @@ static int nf_flow_snat_ipv6(const struc
break;
}
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+ nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
}
-static int nf_flow_dnat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb, struct ipv6hdr *ip6h,
- unsigned int thoff,
- enum flow_offload_tuple_dir dir)
+static void nf_flow_dnat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb, struct ipv6hdr *ip6h,
+ unsigned int thoff,
+ enum flow_offload_tuple_dir dir)
{
struct in6_addr addr, new_addr;
@@ -392,27 +373,24 @@ static int nf_flow_dnat_ipv6(const struc
break;
}
- return nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
+ nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr);
}
-static int nf_flow_nat_ipv6(const struct flow_offload *flow,
- struct sk_buff *skb,
- enum flow_offload_tuple_dir dir,
- struct ipv6hdr *ip6h)
+static void nf_flow_nat_ipv6(const struct flow_offload *flow,
+ struct sk_buff *skb,
+ enum flow_offload_tuple_dir dir,
+ struct ipv6hdr *ip6h)
{
unsigned int thoff = sizeof(*ip6h);
- if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
- (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
- return -1;
-
- if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
- (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
- nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
- return -1;
-
- return 0;
+ if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
+ nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir);
+ nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir);
+ }
+ if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
+ nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir);
+ nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir);
+ }
}
static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
@@ -507,8 +485,7 @@ nf_flow_offload_ipv6_hook(void *priv, st
return NF_DROP;
ip6h = ipv6_hdr(skb);
- if (nf_flow_nat_ipv6(flow, skb, dir, ip6h) < 0)
- return NF_DROP;
+ nf_flow_nat_ipv6(flow, skb, dir, ip6h);
ip6h->hop_limit--;
skb->tstamp = 0;

View File

@@ -1,46 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 23 Mar 2021 00:56:26 +0100
Subject: [PATCH] netfilter: flowtable: call dst_check() to fall back to
classic forwarding
In case the route is stale, pass up the packet to the classic forwarding
path for re-evaluation and schedule this flow entry for removal.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
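A minimal sketch of the resulting fast-path check, condensed from the hunks
below (not additional code from this patch):

        if (!dst_check(&rt->dst, 0)) {          /* route is stale */
                flow_offload_teardown(flow);    /* schedule flow entry removal */
                return NF_ACCEPT;               /* hand packet to classic forwarding */
        }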
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -197,14 +197,6 @@ static bool nf_flow_exceeds_mtu(const st
return true;
}
-static int nf_flow_offload_dst_check(struct dst_entry *dst)
-{
- if (unlikely(dst_xfrm(dst)))
- return dst_check(dst, 0) ? 0 : -1;
-
- return 0;
-}
-
static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
const struct nf_hook_state *state,
struct dst_entry *dst)
@@ -256,7 +248,7 @@ nf_flow_offload_ip_hook(void *priv, stru
flow_offload_refresh(flow_table, flow);
- if (nf_flow_offload_dst_check(&rt->dst)) {
+ if (!dst_check(&rt->dst, 0)) {
flow_offload_teardown(flow);
return NF_ACCEPT;
}
@@ -476,7 +468,7 @@ nf_flow_offload_ipv6_hook(void *priv, st
flow_offload_refresh(flow_table, flow);
- if (nf_flow_offload_dst_check(&rt->dst)) {
+ if (!dst_check(&rt->dst, 0)) {
flow_offload_teardown(flow);
return NF_ACCEPT;
}

View File

@@ -1,49 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 23 Mar 2021 00:56:27 +0100
Subject: [PATCH] netfilter: flowtable: refresh timeout after dst and
writable checks
Refresh the timeout (and retry hardware offload) once the skbuff dst
is confirmed to be current and after the skbuff is made writable.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
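Condensed from the hunks below, the resulting order of operations in the IP
hook (the IPv6 hook mirrors it):

        if (!dst_check(&rt->dst, 0)) {                    /* 1. dst still valid? */
                flow_offload_teardown(flow);
                return NF_ACCEPT;
        }
        if (skb_try_make_writable(skb, thoff + hdrsize))  /* 2. skb writable? */
                return NF_DROP;
        flow_offload_refresh(flow_table, flow);           /* 3. only now refresh */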
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -246,8 +246,6 @@ nf_flow_offload_ip_hook(void *priv, stru
if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
return NF_ACCEPT;
- flow_offload_refresh(flow_table, flow);
-
if (!dst_check(&rt->dst, 0)) {
flow_offload_teardown(flow);
return NF_ACCEPT;
@@ -256,6 +254,8 @@ nf_flow_offload_ip_hook(void *priv, stru
if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
+ flow_offload_refresh(flow_table, flow);
+
iph = ip_hdr(skb);
nf_flow_nat_ip(flow, skb, thoff, dir, iph);
@@ -466,8 +466,6 @@ nf_flow_offload_ipv6_hook(void *priv, st
sizeof(*ip6h)))
return NF_ACCEPT;
- flow_offload_refresh(flow_table, flow);
-
if (!dst_check(&rt->dst, 0)) {
flow_offload_teardown(flow);
return NF_ACCEPT;
@@ -476,6 +474,8 @@ nf_flow_offload_ipv6_hook(void *priv, st
if (skb_try_make_writable(skb, sizeof(*ip6h) + hdrsize))
return NF_DROP;
+ flow_offload_refresh(flow_table, flow);
+
ip6h = ipv6_hdr(skb);
nf_flow_nat_ipv6(flow, skb, dir, ip6h);

View File

@@ -1,103 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 23 Mar 2021 00:56:28 +0100
Subject: [PATCH] netfilter: nftables: update table flags from the commit
phase
Do not update table flags from the preparation phase. Store the flags
update into the transaction, then update the flags from the commit
phase.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
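A minimal sketch of the two-phase update using the transaction fields
introduced below; names are taken from the hunks, surrounding glue code is
elided:

        /* preparation phase: only record the requested change */
        nft_trans_table_state(trans) = NFT_TABLE_STATE_DORMANT;
        nft_trans_table_flags(trans) = flags;
        nft_trans_table_update(trans) = true;

        /* commit phase: apply the recorded state to the table */
        if (nft_trans_table_state(trans) == NFT_TABLE_STATE_DORMANT)
                nf_tables_table_disable(net, trans->ctx.table);
        trans->ctx.table->flags = nft_trans_table_flags(trans);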
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -1470,13 +1470,16 @@ struct nft_trans_chain {
struct nft_trans_table {
bool update;
- bool enable;
+ u8 state;
+ u32 flags;
};
#define nft_trans_table_update(trans) \
(((struct nft_trans_table *)trans->data)->update)
-#define nft_trans_table_enable(trans) \
- (((struct nft_trans_table *)trans->data)->enable)
+#define nft_trans_table_state(trans) \
+ (((struct nft_trans_table *)trans->data)->state)
+#define nft_trans_table_flags(trans) \
+ (((struct nft_trans_table *)trans->data)->flags)
struct nft_trans_elem {
struct nft_set *set;
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -891,6 +891,12 @@ static void nf_tables_table_disable(stru
nft_table_disable(net, table, 0);
}
+enum {
+ NFT_TABLE_STATE_UNCHANGED = 0,
+ NFT_TABLE_STATE_DORMANT,
+ NFT_TABLE_STATE_WAKEUP
+};
+
static int nf_tables_updtable(struct nft_ctx *ctx)
{
struct nft_trans *trans;
@@ -914,19 +920,17 @@ static int nf_tables_updtable(struct nft
if ((flags & NFT_TABLE_F_DORMANT) &&
!(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
- nft_trans_table_enable(trans) = false;
+ nft_trans_table_state(trans) = NFT_TABLE_STATE_DORMANT;
} else if (!(flags & NFT_TABLE_F_DORMANT) &&
ctx->table->flags & NFT_TABLE_F_DORMANT) {
- ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
ret = nf_tables_table_enable(ctx->net, ctx->table);
if (ret >= 0)
- nft_trans_table_enable(trans) = true;
- else
- ctx->table->flags |= NFT_TABLE_F_DORMANT;
+ nft_trans_table_state(trans) = NFT_TABLE_STATE_WAKEUP;
}
if (ret < 0)
goto err;
+ nft_trans_table_flags(trans) = flags;
nft_trans_table_update(trans) = true;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
@@ -7908,11 +7912,10 @@ static int nf_tables_commit(struct net *
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (!nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
- }
+ if (nft_trans_table_state(trans) == NFT_TABLE_STATE_DORMANT)
+ nf_tables_table_disable(net, trans->ctx.table);
+
+ trans->ctx.table->flags = nft_trans_table_flags(trans);
} else {
nft_clear(net, trans->ctx.table);
}
@@ -8125,11 +8128,9 @@ static int __nf_tables_abort(struct net
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
- if (nft_trans_table_enable(trans)) {
- nf_tables_table_disable(net,
- trans->ctx.table);
- trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
- }
+ if (nft_trans_table_state(trans) == NFT_TABLE_STATE_WAKEUP)
+ nf_tables_table_disable(net, trans->ctx.table);
+
nft_trans_destroy(trans);
} else {
list_del_rcu(&trans->ctx.table->list);

View File

@@ -1,170 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:32 +0100
Subject: [PATCH] net: resolve forwarding path from virtual netdevice and
HW destination address
This patch adds dev_fill_forward_path() which resolves the path to reach
the real netdevice from the IP forwarding side. This function takes as
input the netdevice and the destination hardware address and it walks
down the devices calling .ndo_fill_forward_path() for each device until
the real device is found.
For instance, assuming the following topology:
                 IP forwarding
                /             \
             br0               eth0
             / \
         eth1   eth2
           .
           .
           .
         ethX
   ab:cd:ef:ab:cd:ef
where eth1 and eth2 are bridge ports and eth0 provides WAN connectivity.
ethX is the interface in another box which is connected to the eth1
bridge port.
For packets going through IP forwarding to br0 whose destination MAC
address is ab:cd:ef:ab:cd:ef, dev_fill_forward_path() provides the
following path:
br0 -> eth1
.ndo_fill_forward_path for br0 looks up the FDB entry for the destination
MAC address to get the bridge port eth1.
This information makes it possible to create a fast path that bypasses the
classic bridge and IP forwarding paths, so packets go directly from the
bridge port eth1 to eth0 (wan interface) and vice versa.
                fast path
         .------------------------.
        /                          \
       |           IP forwarding   |
       |          /             \  \/
       |       br0              eth0
       .       / \
  ->  eth1   eth2
        .
        .
        .
      ethX
  ab:cd:ef:ab:cd:ef
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
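A hedged caller-side sketch (br0_dev and daddr stand in for a caller's
bridge device and destination MAC; they are not defined by this patch):

        struct net_device_path_stack stack;
        int i;

        if (dev_fill_forward_path(br0_dev, daddr, &stack) < 0)
                return;         /* no fast path; use classic forwarding */

        for (i = 0; i < stack.num_paths; i++) {
                const struct net_device_path *path = &stack.path[i];
                /* the last entry is DEV_PATH_ETHERNET with path->dev
                 * pointing at the real (lowest) netdevice */
        }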
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -827,6 +827,27 @@ typedef u16 (*select_queue_fallback_t)(s
struct sk_buff *skb,
struct net_device *sb_dev);
+enum net_device_path_type {
+ DEV_PATH_ETHERNET = 0,
+};
+
+struct net_device_path {
+ enum net_device_path_type type;
+ const struct net_device *dev;
+};
+
+#define NET_DEVICE_PATH_STACK_MAX 5
+
+struct net_device_path_stack {
+ int num_paths;
+ struct net_device_path path[NET_DEVICE_PATH_STACK_MAX];
+};
+
+struct net_device_path_ctx {
+ const struct net_device *dev;
+ const u8 *daddr;
+};
+
enum tc_setup_type {
TC_SETUP_QDISC_MQPRIO,
TC_SETUP_CLSU32,
@@ -1273,6 +1294,8 @@ struct netdev_net_notifier {
* struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
* If a device is paired with a peer device, return the peer instance.
* The caller must be under RCU read context.
+ * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path);
+ * Get the forwarding path to reach the real device from the HW destination address
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
@@ -1481,6 +1504,8 @@ struct net_device_ops {
int (*ndo_tunnel_ctl)(struct net_device *dev,
struct ip_tunnel_parm *p, int cmd);
struct net_device * (*ndo_get_peer_dev)(struct net_device *dev);
+ int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
+ struct net_device_path *path);
};
/**
@@ -2828,6 +2853,8 @@ void dev_remove_offload(struct packet_of
int dev_get_iflink(const struct net_device *dev);
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+ struct net_device_path_stack *stack);
struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
unsigned short mask);
struct net_device *dev_get_by_name(struct net *net, const char *name);
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -847,6 +847,52 @@ int dev_fill_metadata_dst(struct net_dev
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
+static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
+{
+ int k = stack->num_paths++;
+
+ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
+ return NULL;
+
+ return &stack->path[k];
+}
+
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+ struct net_device_path_stack *stack)
+{
+ const struct net_device *last_dev;
+ struct net_device_path_ctx ctx = {
+ .dev = dev,
+ .daddr = daddr,
+ };
+ struct net_device_path *path;
+ int ret = 0;
+
+ stack->num_paths = 0;
+ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
+ last_dev = ctx.dev;
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+
+ memset(path, 0, sizeof(struct net_device_path));
+ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
+ if (ret < 0)
+ return -1;
+
+ if (WARN_ON_ONCE(last_dev == ctx.dev))
+ return -1;
+ }
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+ path->type = DEV_PATH_ETHERNET;
+ path->dev = ctx.dev;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_fill_forward_path);
+
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace

View File

@@ -1,80 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:33 +0100
Subject: [PATCH] net: 8021q: resolve forwarding path for vlan devices
Add .ndo_fill_forward_path for vlan devices.
For instance, assuming the following topology:
                IP forwarding
               /             \
         eth0.100             eth0
              |
             eth0
              .
              .
              .
            ethX
     ab:cd:ef:ab:cd:ef
For packets going through IP forwarding to eth0.100 whose destination
MAC address is ab:cd:ef:ab:cd:ef, dev_fill_forward_path() provides the
following path:
eth0.100 -> eth0
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
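For the topology above, a sketch of what vlan_dev_fill_forward_path()
(added below) records for eth0.100; the 802.1Q proto value is an assumption
for this example:

        path->type        = DEV_PATH_VLAN;
        path->encap.id    = 100;                 /* vlan->vlan_id */
        path->encap.proto = htons(ETH_P_8021Q);  /* vlan->vlan_proto, assumed */
        path->dev         = ctx->dev;            /* eth0.100 */
        ctx->dev          = vlan->real_dev;      /* walk continues at eth0 */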
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -829,11 +829,18 @@ typedef u16 (*select_queue_fallback_t)(s
enum net_device_path_type {
DEV_PATH_ETHERNET = 0,
+ DEV_PATH_VLAN,
};
struct net_device_path {
enum net_device_path_type type;
const struct net_device *dev;
+ union {
+ struct {
+ u16 id;
+ __be16 proto;
+ } encap;
+ };
};
#define NET_DEVICE_PATH_STACK_MAX 5
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -770,6 +770,20 @@ static int vlan_dev_get_iflink(const str
return real_dev->ifindex;
}
+static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev);
+
+ path->type = DEV_PATH_VLAN;
+ path->encap.id = vlan->vlan_id;
+ path->encap.proto = vlan->vlan_proto;
+ path->dev = ctx->dev;
+ ctx->dev = vlan->real_dev;
+
+ return 0;
+}
+
static const struct ethtool_ops vlan_ethtool_ops = {
.get_link_ksettings = vlan_ethtool_get_link_ksettings,
.get_drvinfo = vlan_ethtool_get_drvinfo,
@@ -808,6 +822,7 @@ static const struct net_device_ops vlan_
#endif
.ndo_fix_features = vlan_dev_fix_features,
.ndo_get_iflink = vlan_dev_get_iflink,
+ .ndo_fill_forward_path = vlan_dev_fill_forward_path,
};
static void vlan_dev_free(struct net_device *dev)

View File

@@ -1,62 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:34 +0100
Subject: [PATCH] net: bridge: resolve forwarding path for bridge devices
Add .ndo_fill_forward_path for bridge devices.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -830,6 +830,7 @@ typedef u16 (*select_queue_fallback_t)(s
enum net_device_path_type {
DEV_PATH_ETHERNET = 0,
DEV_PATH_VLAN,
+ DEV_PATH_BRIDGE,
};
struct net_device_path {
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -398,6 +398,32 @@ static int br_del_slave(struct net_devic
return br_del_if(br, slave_dev);
}
+static int br_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct net_bridge_fdb_entry *f;
+ struct net_bridge_port *dst;
+ struct net_bridge *br;
+
+ if (netif_is_bridge_port(ctx->dev))
+ return -1;
+
+ br = netdev_priv(ctx->dev);
+ f = br_fdb_find_rcu(br, ctx->daddr, 0);
+ if (!f || !f->dst)
+ return -1;
+
+ dst = READ_ONCE(f->dst);
+ if (!dst)
+ return -1;
+
+ path->type = DEV_PATH_BRIDGE;
+ path->dev = dst->br->dev;
+ ctx->dev = dst->dev;
+
+ return 0;
+}
+
static const struct ethtool_ops br_ethtool_ops = {
.get_drvinfo = br_getinfo,
.get_link = ethtool_op_get_link,
@@ -432,6 +458,7 @@ static const struct net_device_ops br_ne
.ndo_bridge_setlink = br_setlink,
.ndo_bridge_dellink = br_dellink,
.ndo_features_check = passthru_features_check,
+ .ndo_fill_forward_path = br_fill_forward_path,
};
static struct device_type br_type = {

View File

@@ -1,207 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 24 Mar 2021 02:30:35 +0100
Subject: [PATCH] net: bridge: resolve forwarding path for VLAN tag
actions in bridge devices
Depending on the VLAN settings of the bridge and the port, the bridge can
either add or remove a tag. When vlan filtering is enabled, the fdb lookup
also needs to know the VLAN tag/proto for the destination address.
To provide this, keep track of the stack of VLAN tags for the path in the
lookup context.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
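Condensed from br_fill_forward_path() in the hunks below, a sketch of how
the path walk consumes the per-port VLAN action:

        switch (path->bridge.vlan_mode) {
        case DEV_PATH_BR_VLAN_TAG:      /* bridge adds a tag: push it */
                ctx->vlan[ctx->num_vlans].id = path->bridge.vlan_id;
                ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto;
                ctx->num_vlans++;
                break;
        case DEV_PATH_BR_VLAN_UNTAG:    /* bridge strips the tag: pop it */
                ctx->num_vlans--;
                break;
        case DEV_PATH_BR_VLAN_KEEP:     /* tag passes through unchanged */
                break;
        }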
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -841,10 +841,20 @@ struct net_device_path {
u16 id;
__be16 proto;
} encap;
+ struct {
+ enum {
+ DEV_PATH_BR_VLAN_KEEP,
+ DEV_PATH_BR_VLAN_TAG,
+ DEV_PATH_BR_VLAN_UNTAG,
+ } vlan_mode;
+ u16 vlan_id;
+ __be16 vlan_proto;
+ } bridge;
};
};
#define NET_DEVICE_PATH_STACK_MAX 5
+#define NET_DEVICE_PATH_VLAN_MAX 2
struct net_device_path_stack {
int num_paths;
@@ -854,6 +864,12 @@ struct net_device_path_stack {
struct net_device_path_ctx {
const struct net_device *dev;
const u8 *daddr;
+
+ int num_vlans;
+ struct {
+ u16 id;
+ __be16 proto;
+ } vlan[NET_DEVICE_PATH_VLAN_MAX];
};
enum tc_setup_type {
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -780,6 +780,12 @@ static int vlan_dev_fill_forward_path(st
path->encap.proto = vlan->vlan_proto;
path->dev = ctx->dev;
ctx->dev = vlan->real_dev;
+ if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan))
+ return -ENOSPC;
+
+ ctx->vlan[ctx->num_vlans].id = vlan->vlan_id;
+ ctx->vlan[ctx->num_vlans].proto = vlan->vlan_proto;
+ ctx->num_vlans++;
return 0;
}
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -409,7 +409,10 @@ static int br_fill_forward_path(struct n
return -1;
br = netdev_priv(ctx->dev);
- f = br_fdb_find_rcu(br, ctx->daddr, 0);
+
+ br_vlan_fill_forward_path_pvid(br, ctx, path);
+
+ f = br_fdb_find_rcu(br, ctx->daddr, path->bridge.vlan_id);
if (!f || !f->dst)
return -1;
@@ -417,10 +420,28 @@ static int br_fill_forward_path(struct n
if (!dst)
return -1;
+ if (br_vlan_fill_forward_path_mode(br, dst, path))
+ return -1;
+
path->type = DEV_PATH_BRIDGE;
path->dev = dst->br->dev;
ctx->dev = dst->dev;
+ switch (path->bridge.vlan_mode) {
+ case DEV_PATH_BR_VLAN_TAG:
+ if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan))
+ return -ENOSPC;
+ ctx->vlan[ctx->num_vlans].id = path->bridge.vlan_id;
+ ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto;
+ ctx->num_vlans++;
+ break;
+ case DEV_PATH_BR_VLAN_UNTAG:
+ ctx->num_vlans--;
+ break;
+ case DEV_PATH_BR_VLAN_KEEP:
+ break;
+ }
+
return 0;
}
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1093,6 +1093,13 @@ void br_vlan_notify(const struct net_bri
bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
const struct net_bridge_vlan *range_end);
+void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path);
+int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path);
+
static inline struct net_bridge_vlan_group *br_vlan_group(
const struct net_bridge *br)
{
@@ -1250,6 +1257,19 @@ static inline int nbp_get_num_vlan_infos
{
return 0;
}
+
+static inline void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+}
+
+static inline int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path)
+{
+ return 0;
+}
static inline struct net_bridge_vlan_group *br_vlan_group(
const struct net_bridge *br)
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1327,6 +1327,59 @@ int br_vlan_get_pvid_rcu(const struct ne
}
EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu);
+void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct net_bridge_vlan_group *vg;
+ int idx = ctx->num_vlans - 1;
+ u16 vid;
+
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+ return;
+
+ vg = br_vlan_group(br);
+
+ if (idx >= 0 &&
+ ctx->vlan[idx].proto == br->vlan_proto) {
+ vid = ctx->vlan[idx].id;
+ } else {
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG;
+ vid = br_get_pvid(vg);
+ }
+
+ path->bridge.vlan_id = vid;
+ path->bridge.vlan_proto = br->vlan_proto;
+}
+
+int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+ return 0;
+
+ vg = nbp_vlan_group_rcu(dst);
+ v = br_vlan_find(vg, path->bridge.vlan_id);
+ if (!v || !br_vlan_should_use(v))
+ return -EINVAL;
+
+ if (!(v->flags & BRIDGE_VLAN_INFO_UNTAGGED))
+ return 0;
+
+ if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG)
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+ else
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG;
+
+ return 0;
+}
+
int br_vlan_get_info(const struct net_device *dev, u16 vid,
struct bridge_vlan_info *p_vinfo)
{

View File

@@ -1,113 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 24 Mar 2021 02:30:36 +0100
Subject: [PATCH] net: ppp: resolve forwarding path for bridge pppoe
devices
Pass on the PPPoE session ID, destination hardware address and the real
device.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -1466,12 +1466,34 @@ static void ppp_dev_priv_destructor(stru
ppp_destroy_interface(ppp);
}
+static int ppp_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct ppp *ppp = netdev_priv(ctx->dev);
+ struct ppp_channel *chan;
+ struct channel *pch;
+
+ if (ppp->flags & SC_MULTILINK)
+ return -EOPNOTSUPP;
+
+ if (list_empty(&ppp->channels))
+ return -ENODEV;
+
+ pch = list_first_entry(&ppp->channels, struct channel, clist);
+ chan = pch->chan;
+ if (!chan->ops->fill_forward_path)
+ return -EOPNOTSUPP;
+
+ return chan->ops->fill_forward_path(ctx, path, chan);
+}
+
static const struct net_device_ops ppp_netdev_ops = {
.ndo_init = ppp_dev_init,
.ndo_uninit = ppp_dev_uninit,
.ndo_start_xmit = ppp_start_xmit,
.ndo_do_ioctl = ppp_net_ioctl,
.ndo_get_stats64 = ppp_get_stats64,
+ .ndo_fill_forward_path = ppp_fill_forward_path,
};
static struct device_type ppp_type = {
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -972,8 +972,31 @@ static int pppoe_xmit(struct ppp_channel
return __pppoe_xmit(sk, skb);
}
+static int pppoe_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path,
+ const struct ppp_channel *chan)
+{
+ struct sock *sk = (struct sock *)chan->private;
+ struct pppox_sock *po = pppox_sk(sk);
+ struct net_device *dev = po->pppoe_dev;
+
+ if (sock_flag(sk, SOCK_DEAD) ||
+ !(sk->sk_state & PPPOX_CONNECTED) || !dev)
+ return -1;
+
+ path->type = DEV_PATH_PPPOE;
+ path->encap.proto = htons(ETH_P_PPP_SES);
+ path->encap.id = be16_to_cpu(po->num);
+ memcpy(path->encap.h_dest, po->pppoe_pa.remote, ETH_ALEN);
+ path->dev = ctx->dev;
+ ctx->dev = dev;
+
+ return 0;
+}
+
static const struct ppp_channel_ops pppoe_chan_ops = {
.start_xmit = pppoe_xmit,
+ .fill_forward_path = pppoe_fill_forward_path,
};
static int pppoe_recvmsg(struct socket *sock, struct msghdr *m,
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -831,6 +831,7 @@ enum net_device_path_type {
DEV_PATH_ETHERNET = 0,
DEV_PATH_VLAN,
DEV_PATH_BRIDGE,
+ DEV_PATH_PPPOE,
};
struct net_device_path {
@@ -840,6 +841,7 @@ struct net_device_path {
struct {
u16 id;
__be16 proto;
+ u8 h_dest[ETH_ALEN];
} encap;
struct {
enum {
--- a/include/linux/ppp_channel.h
+++ b/include/linux/ppp_channel.h
@@ -28,6 +28,9 @@ struct ppp_channel_ops {
int (*start_xmit)(struct ppp_channel *, struct sk_buff *);
/* Handle an ioctl call that has come in via /dev/ppp. */
int (*ioctl)(struct ppp_channel *, unsigned int, unsigned long);
+ int (*fill_forward_path)(struct net_device_path_ctx *,
+ struct net_device_path *,
+ const struct ppp_channel *);
};
struct ppp_channel {

View File

@@ -1,63 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 24 Mar 2021 02:30:37 +0100
Subject: [PATCH] net: dsa: resolve forwarding path for dsa slave ports
Add .ndo_fill_forward_path for dsa slave port devices
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -832,6 +832,7 @@ enum net_device_path_type {
DEV_PATH_VLAN,
DEV_PATH_BRIDGE,
DEV_PATH_PPPOE,
+ DEV_PATH_DSA,
};
struct net_device_path {
@@ -852,6 +853,10 @@ struct net_device_path {
u16 vlan_id;
__be16 vlan_proto;
} bridge;
+ struct {
+ int port;
+ u16 proto;
+ } dsa;
};
};
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1619,6 +1619,21 @@ static struct devlink_port *dsa_slave_ge
return dp->ds->devlink ? &dp->devlink_port : NULL;
}
+static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct dsa_port *dp = dsa_slave_to_port(ctx->dev);
+ struct dsa_port *cpu_dp = dp->cpu_dp;
+
+ path->dev = ctx->dev;
+ path->type = DEV_PATH_DSA;
+ path->dsa.proto = cpu_dp->tag_ops->proto;
+ path->dsa.port = dp->index;
+ ctx->dev = cpu_dp->master;
+
+ return 0;
+}
+
static const struct net_device_ops dsa_slave_netdev_ops = {
.ndo_open = dsa_slave_open,
.ndo_stop = dsa_slave_close,
@@ -1644,6 +1659,7 @@ static const struct net_device_ops dsa_s
.ndo_vlan_rx_kill_vid = dsa_slave_vlan_rx_kill_vid,
.ndo_get_devlink_port = dsa_slave_get_devlink_port,
.ndo_change_mtu = dsa_slave_change_mtu,
+ .ndo_fill_forward_path = dsa_slave_fill_forward_path,
};
static struct device_type dsa_type = {

View File

@@ -1,147 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:38 +0100
Subject: [PATCH] netfilter: flowtable: add xmit path types
Add the xmit_type field that defines the two supported xmit paths in the
flowtable data plane, which are the neighbour and the xfrm xmit paths.
This patch prepares for new flowtable xmit path types to come.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
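The xmit type is derived once from the route; a sketch condensed from
nft_xmit_type() in the hunks below:

        static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
        {
                if (dst_xfrm(dst))
                        return FLOW_OFFLOAD_XMIT_XFRM;  /* IPsec transformation */

                return FLOW_OFFLOAD_XMIT_NEIGH;         /* neighbour output */
        }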
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -89,6 +89,11 @@ enum flow_offload_tuple_dir {
};
#define FLOW_OFFLOAD_DIR_MAX IP_CT_DIR_MAX
+enum flow_offload_xmit_type {
+ FLOW_OFFLOAD_XMIT_NEIGH = 0,
+ FLOW_OFFLOAD_XMIT_XFRM,
+};
+
struct flow_offload_tuple {
union {
struct in_addr src_v4;
@@ -111,7 +116,8 @@ struct flow_offload_tuple {
/* All members above are keys for lookups, see flow_offload_hash(). */
struct { } __hash;
- u8 dir;
+ u8 dir:6,
+ xmit_type:2;
u16 mtu;
@@ -157,7 +163,8 @@ static inline __s32 nf_flow_timeout_delt
struct nf_flow_route {
struct {
- struct dst_entry *dst;
+ struct dst_entry *dst;
+ enum flow_offload_xmit_type xmit_type;
} tuple[FLOW_OFFLOAD_DIR_MAX];
};
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -95,6 +95,7 @@ static int flow_offload_fill_route(struc
}
flow_tuple->iifidx = other_dst->dev->ifindex;
+ flow_tuple->xmit_type = route->tuple[dir].xmit_type;
flow_tuple->dst_cache = dst;
return 0;
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -235,8 +235,6 @@ nf_flow_offload_ip_hook(void *priv, stru
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
- outdev = rt->dst.dev;
if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
return NF_ACCEPT;
@@ -265,13 +263,16 @@ nf_flow_offload_ip_hook(void *priv, stru
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
- if (unlikely(dst_xfrm(&rt->dst))) {
+ rt = (struct rtable *)tuplehash->tuple.dst_cache;
+
+ if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
memset(skb->cb, 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->dev->ifindex;
IPCB(skb)->flags = IPSKB_FORWARDED;
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
+ outdev = rt->dst.dev;
skb->dev = outdev;
nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
skb_dst_set_noref(skb, &rt->dst);
@@ -456,8 +457,6 @@ nf_flow_offload_ipv6_hook(void *priv, st
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- rt = (struct rt6_info *)flow->tuplehash[dir].tuple.dst_cache;
- outdev = rt->dst.dev;
if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
return NF_ACCEPT;
@@ -485,13 +484,16 @@ nf_flow_offload_ipv6_hook(void *priv, st
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
- if (unlikely(dst_xfrm(&rt->dst))) {
+ rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
+
+ if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
IP6CB(skb)->iif = skb->dev->ifindex;
IP6CB(skb)->flags = IP6SKB_FORWARDED;
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
+ outdev = rt->dst.dev;
skb->dev = outdev;
nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
skb_dst_set_noref(skb, &rt->dst);
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -19,6 +19,22 @@ struct nft_flow_offload {
struct nft_flowtable *flowtable;
};
+static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst)
+{
+ if (dst_xfrm(dst))
+ return FLOW_OFFLOAD_XMIT_XFRM;
+
+ return FLOW_OFFLOAD_XMIT_NEIGH;
+}
+
+static void nft_default_forward_path(struct nf_flow_route *route,
+ struct dst_entry *dst_cache,
+ enum ip_conntrack_dir dir)
+{
+ route->tuple[dir].dst = dst_cache;
+ route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
+}
+
static int nft_flow_route(const struct nft_pktinfo *pkt,
const struct nf_conn *ct,
struct nf_flow_route *route,
@@ -44,8 +60,8 @@ static int nft_flow_route(const struct n
if (!other_dst)
return -ENOENT;
- route->tuple[dir].dst = this_dst;
- route->tuple[!dir].dst = other_dst;
+ nft_default_forward_path(route, this_dst, dir);
+ nft_default_forward_path(route, other_dst, !dir);
return 0;
}

View File

@@ -1,191 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:39 +0100
Subject: [PATCH] netfilter: flowtable: use dev_fill_forward_path() to
obtain ingress device
The ingress device in the tuple is currently obtained from the route in the
reply direction. Use dev_fill_forward_path() instead to get the real ingress
device for this flow.
Fall back to using the ingress device that the IP forwarding route
provides if:
- dev_fill_forward_path() finds no real ingress device.
- the ingress device that is obtained is not part of the flowtable
  devices.
- this route has an xfrm policy.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
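The fallback logic reduces to the pattern below, condensed from
nft_dev_forward_path() in the hunks that follow:

        if (nft_dev_fill_forward_path(route, dst, ct, dir, &stack) >= 0)
                nft_dev_path_info(&stack, &info);

        if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
                return;         /* keep the ifindex from the route */

        route->tuple[!dir].in.ifindex = info.indev->ifindex;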
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -164,6 +164,9 @@ static inline __s32 nf_flow_timeout_delt
struct nf_flow_route {
struct {
struct dst_entry *dst;
+ struct {
+ u32 ifindex;
+ } in;
enum flow_offload_xmit_type xmit_type;
} tuple[FLOW_OFFLOAD_DIR_MAX];
};
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -79,7 +79,6 @@ static int flow_offload_fill_route(struc
enum flow_offload_tuple_dir dir)
{
struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
- struct dst_entry *other_dst = route->tuple[!dir].dst;
struct dst_entry *dst = route->tuple[dir].dst;
if (!dst_hold_safe(route->tuple[dir].dst))
@@ -94,7 +93,7 @@ static int flow_offload_fill_route(struc
break;
}
- flow_tuple->iifidx = other_dst->dev->ifindex;
+ flow_tuple->iifidx = route->tuple[dir].in.ifindex;
flow_tuple->xmit_type = route->tuple[dir].xmit_type;
flow_tuple->dst_cache = dst;
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -31,14 +31,104 @@ static void nft_default_forward_path(str
struct dst_entry *dst_cache,
enum ip_conntrack_dir dir)
{
+ route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex;
route->tuple[dir].dst = dst_cache;
route->tuple[dir].xmit_type = nft_xmit_type(dst_cache);
}
+static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
+ const struct dst_entry *dst_cache,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct net_device_path_stack *stack)
+{
+ const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
+ struct net_device *dev = dst_cache->dev;
+ unsigned char ha[ETH_ALEN];
+ struct neighbour *n;
+ u8 nud_state;
+
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -1;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
+ neigh_release(n);
+
+ if (!(nud_state & NUD_VALID))
+ return -1;
+
+ return dev_fill_forward_path(dev, ha, stack);
+}
+
+struct nft_forward_info {
+ const struct net_device *indev;
+};
+
+static void nft_dev_path_info(const struct net_device_path_stack *stack,
+ struct nft_forward_info *info)
+{
+ const struct net_device_path *path;
+ int i;
+
+ for (i = 0; i < stack->num_paths; i++) {
+ path = &stack->path[i];
+ switch (path->type) {
+ case DEV_PATH_ETHERNET:
+ info->indev = path->dev;
+ break;
+ case DEV_PATH_VLAN:
+ case DEV_PATH_BRIDGE:
+ default:
+ info->indev = NULL;
+ break;
+ }
+ }
+}
+
+static bool nft_flowtable_find_dev(const struct net_device *dev,
+ struct nft_flowtable *ft)
+{
+ struct nft_hook *hook;
+ bool found = false;
+
+ list_for_each_entry_rcu(hook, &ft->hook_list, list) {
+ if (hook->ops.dev != dev)
+ continue;
+
+ found = true;
+ break;
+ }
+
+ return found;
+}
+
+static void nft_dev_forward_path(struct nf_flow_route *route,
+ const struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
+{
+ const struct dst_entry *dst = route->tuple[dir].dst;
+ struct net_device_path_stack stack;
+ struct nft_forward_info info = {};
+
+ if (nft_dev_fill_forward_path(route, dst, ct, dir, &stack) >= 0)
+ nft_dev_path_info(&stack, &info);
+
+ if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
+ return;
+
+ route->tuple[!dir].in.ifindex = info.indev->ifindex;
+}
+
static int nft_flow_route(const struct nft_pktinfo *pkt,
const struct nf_conn *ct,
struct nf_flow_route *route,
- enum ip_conntrack_dir dir)
+ enum ip_conntrack_dir dir,
+ struct nft_flowtable *ft)
{
struct dst_entry *this_dst = skb_dst(pkt->skb);
struct dst_entry *other_dst = NULL;
@@ -63,6 +153,12 @@ static int nft_flow_route(const struct n
nft_default_forward_path(route, this_dst, dir);
nft_default_forward_path(route, other_dst, !dir);
+ if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH &&
+ route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) {
+ nft_dev_forward_path(route, ct, dir, ft);
+ nft_dev_forward_path(route, ct, !dir, ft);
+ }
+
return 0;
}
@@ -90,8 +186,8 @@ static void nft_flow_offload_eval(const
struct nft_flow_offload *priv = nft_expr_priv(expr);
struct nf_flowtable *flowtable = &priv->flowtable->data;
struct tcphdr _tcph, *tcph = NULL;
+ struct nf_flow_route route = {};
enum ip_conntrack_info ctinfo;
- struct nf_flow_route route;
struct flow_offload *flow;
enum ip_conntrack_dir dir;
struct nf_conn *ct;
@@ -128,7 +224,7 @@ static void nft_flow_offload_eval(const
goto out;
dir = CTINFO2DIR(ctinfo);
- if (nft_flow_route(pkt, ct, &route, dir) < 0)
+ if (nft_flow_route(pkt, ct, &route, dir, priv->flowtable) < 0)
goto err_flow_route;
flow = flow_offload_alloc(ct);

View File

@@ -1,374 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:40 +0100
Subject: [PATCH] netfilter: flowtable: use dev_fill_forward_path() to
obtain egress device
The egress device in the tuple is obtained from the route. Use
dev_fill_forward_path() instead to provide the real egress device for
this flow whenever this is available.
The new FLOW_OFFLOAD_XMIT_DIRECT type uses dev_queue_xmit() to transmit
Ethernet frames. Cache the source and destination hardware addresses that
are needed to transfer packets via dev_queue_xmit().
The FLOW_OFFLOAD_XMIT_DIRECT replaces FLOW_OFFLOAD_XMIT_NEIGH if
dev_fill_forward_path() finds a direct transmit path.
In case of topology updates, if the peer is moved to a different bridge
port, the connection will time out and a reconnect will result in a new
entry with the correct path. Snooping fdb updates would allow stale
flowtable entries to be cleaned up.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
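A sketch of the direct transmit path, condensed from nf_flow_queue_xmit()
in the hunks below: the Ethernet header is rebuilt from the cached
addresses and the skb is handed straight to the egress device:

        skb->dev = outdev;
        dev_hard_header(skb, skb->dev, ETH_P_IP,
                        tuplehash->tuple.out.h_dest,
                        tuplehash->tuple.out.h_source, skb->len);
        dev_queue_xmit(skb);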
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -92,6 +92,7 @@ enum flow_offload_tuple_dir {
enum flow_offload_xmit_type {
FLOW_OFFLOAD_XMIT_NEIGH = 0,
FLOW_OFFLOAD_XMIT_XFRM,
+ FLOW_OFFLOAD_XMIT_DIRECT,
};
struct flow_offload_tuple {
@@ -120,8 +121,14 @@ struct flow_offload_tuple {
xmit_type:2;
u16 mtu;
-
- struct dst_entry *dst_cache;
+ union {
+ struct dst_entry *dst_cache;
+ struct {
+ u32 ifidx;
+ u8 h_source[ETH_ALEN];
+ u8 h_dest[ETH_ALEN];
+ } out;
+ };
};
struct flow_offload_tuple_rhash {
@@ -167,6 +174,11 @@ struct nf_flow_route {
struct {
u32 ifindex;
} in;
+ struct {
+ u32 ifindex;
+ u8 h_source[ETH_ALEN];
+ u8 h_dest[ETH_ALEN];
+ } out;
enum flow_offload_xmit_type xmit_type;
} tuple[FLOW_OFFLOAD_DIR_MAX];
};
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -81,9 +81,6 @@ static int flow_offload_fill_route(struc
struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
struct dst_entry *dst = route->tuple[dir].dst;
- if (!dst_hold_safe(route->tuple[dir].dst))
- return -1;
-
switch (flow_tuple->l3proto) {
case NFPROTO_IPV4:
flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
@@ -94,12 +91,36 @@ static int flow_offload_fill_route(struc
}
flow_tuple->iifidx = route->tuple[dir].in.ifindex;
+
+ switch (route->tuple[dir].xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
+ ETH_ALEN);
+ memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
+ ETH_ALEN);
+ flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
+ break;
+ case FLOW_OFFLOAD_XMIT_XFRM:
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ if (!dst_hold_safe(route->tuple[dir].dst))
+ return -1;
+
+ flow_tuple->dst_cache = dst;
+ break;
+ }
flow_tuple->xmit_type = route->tuple[dir].xmit_type;
- flow_tuple->dst_cache = dst;
return 0;
}
+static void nft_flow_dst_release(struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir)
+{
+ if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
+ dst_release(flow->tuplehash[dir].tuple.dst_cache);
+}
+
int flow_offload_route_init(struct flow_offload *flow,
const struct nf_flow_route *route)
{
@@ -118,7 +139,7 @@ int flow_offload_route_init(struct flow_
return 0;
err_route_reply:
- dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
return err;
}
@@ -169,8 +190,8 @@ static void flow_offload_fixup_ct(struct
static void flow_offload_route_release(struct flow_offload *flow)
{
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
- dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
+ nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
}
void flow_offload_free(struct flow_offload *flow)
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -207,6 +207,24 @@ static unsigned int nf_flow_xmit_xfrm(st
return NF_STOLEN;
}
+static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
+ const struct flow_offload_tuple_rhash *tuplehash,
+ unsigned short type)
+{
+ struct net_device *outdev;
+
+ outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx);
+ if (!outdev)
+ return NF_DROP;
+
+ skb->dev = outdev;
+ dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest,
+ tuplehash->tuple.out.h_source, skb->len);
+ dev_queue_xmit(skb);
+
+ return NF_STOLEN;
+}
+
unsigned int
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
@@ -222,6 +240,7 @@ nf_flow_offload_ip_hook(void *priv, stru
struct iphdr *iph;
__be32 nexthop;
u32 hdrsize;
+ int ret;
if (skb->protocol != htons(ETH_P_IP))
return NF_ACCEPT;
@@ -244,9 +263,13 @@ nf_flow_offload_ip_hook(void *priv, stru
if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
return NF_ACCEPT;
- if (!dst_check(&rt->dst, 0)) {
- flow_offload_teardown(flow);
- return NF_ACCEPT;
+ if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
+ rt = (struct rtable *)tuplehash->tuple.dst_cache;
+ if (!dst_check(&rt->dst, 0)) {
+ flow_offload_teardown(flow);
+ return NF_ACCEPT;
+ }
}
if (skb_try_make_writable(skb, thoff + hdrsize))
@@ -263,8 +286,6 @@ nf_flow_offload_ip_hook(void *priv, stru
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
- rt = (struct rtable *)tuplehash->tuple.dst_cache;
-
if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
memset(skb->cb, 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->dev->ifindex;
@@ -272,13 +293,23 @@ nf_flow_offload_ip_hook(void *priv, stru
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
- outdev = rt->dst.dev;
- skb->dev = outdev;
- nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
- skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+ switch (tuplehash->tuple.xmit_type) {
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ outdev = rt->dst.dev;
+ skb->dev = outdev;
+ nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
+ skb_dst_set_noref(skb, &rt->dst);
+ neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb);
+ ret = NF_STOLEN;
+ break;
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP);
+ if (ret == NF_DROP)
+ flow_offload_teardown(flow);
+ break;
+ }
- return NF_STOLEN;
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook);
@@ -444,6 +475,7 @@ nf_flow_offload_ipv6_hook(void *priv, st
struct ipv6hdr *ip6h;
struct rt6_info *rt;
u32 hdrsize;
+ int ret;
if (skb->protocol != htons(ETH_P_IPV6))
return NF_ACCEPT;
@@ -465,9 +497,13 @@ nf_flow_offload_ipv6_hook(void *priv, st
sizeof(*ip6h)))
return NF_ACCEPT;
- if (!dst_check(&rt->dst, 0)) {
- flow_offload_teardown(flow);
- return NF_ACCEPT;
+ if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
+ rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
+ if (!dst_check(&rt->dst, 0)) {
+ flow_offload_teardown(flow);
+ return NF_ACCEPT;
+ }
}
if (skb_try_make_writable(skb, sizeof(*ip6h) + hdrsize))
@@ -484,8 +520,6 @@ nf_flow_offload_ipv6_hook(void *priv, st
if (flow_table->flags & NF_FLOWTABLE_COUNTER)
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
- rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
-
if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
IP6CB(skb)->iif = skb->dev->ifindex;
@@ -493,12 +527,22 @@ nf_flow_offload_ipv6_hook(void *priv, st
return nf_flow_xmit_xfrm(skb, state, &rt->dst);
}
- outdev = rt->dst.dev;
- skb->dev = outdev;
- nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
- skb_dst_set_noref(skb, &rt->dst);
- neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+ switch (tuplehash->tuple.xmit_type) {
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ outdev = rt->dst.dev;
+ skb->dev = outdev;
+ nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
+ skb_dst_set_noref(skb, &rt->dst);
+ neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb);
+ ret = NF_STOLEN;
+ break;
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6);
+ if (ret == NF_DROP)
+ flow_offload_teardown(flow);
+ break;
+ }
- return NF_STOLEN;
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -39,12 +39,11 @@ static void nft_default_forward_path(str
static int nft_dev_fill_forward_path(const struct nf_flow_route *route,
const struct dst_entry *dst_cache,
const struct nf_conn *ct,
- enum ip_conntrack_dir dir,
+ enum ip_conntrack_dir dir, u8 *ha,
struct net_device_path_stack *stack)
{
const void *daddr = &ct->tuplehash[!dir].tuple.src.u3;
struct net_device *dev = dst_cache->dev;
- unsigned char ha[ETH_ALEN];
struct neighbour *n;
u8 nud_state;
@@ -66,27 +65,43 @@ static int nft_dev_fill_forward_path(con
struct nft_forward_info {
const struct net_device *indev;
+ const struct net_device *outdev;
+ u8 h_source[ETH_ALEN];
+ u8 h_dest[ETH_ALEN];
+ enum flow_offload_xmit_type xmit_type;
};
static void nft_dev_path_info(const struct net_device_path_stack *stack,
- struct nft_forward_info *info)
+ struct nft_forward_info *info,
+ unsigned char *ha)
{
const struct net_device_path *path;
int i;
+ memcpy(info->h_dest, ha, ETH_ALEN);
+
for (i = 0; i < stack->num_paths; i++) {
path = &stack->path[i];
switch (path->type) {
case DEV_PATH_ETHERNET:
info->indev = path->dev;
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
break;
- case DEV_PATH_VLAN:
case DEV_PATH_BRIDGE:
+ if (is_zero_ether_addr(info->h_source))
+ memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
+ break;
+ case DEV_PATH_VLAN:
default:
info->indev = NULL;
break;
}
}
+ if (!info->outdev)
+ info->outdev = info->indev;
}
static bool nft_flowtable_find_dev(const struct net_device *dev,
@@ -114,14 +129,22 @@ static void nft_dev_forward_path(struct
const struct dst_entry *dst = route->tuple[dir].dst;
struct net_device_path_stack stack;
struct nft_forward_info info = {};
+ unsigned char ha[ETH_ALEN];
- if (nft_dev_fill_forward_path(route, dst, ct, dir, &stack) >= 0)
- nft_dev_path_info(&stack, &info);
+ if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
+ nft_dev_path_info(&stack, &info, ha);
if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
return;
route->tuple[!dir].in.ifindex = info.indev->ifindex;
+
+ if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
+ memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
+ memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
+ route->tuple[dir].out.ifindex = info.outdev->ifindex;
+ route->tuple[dir].xmit_type = info.xmit_type;
+ }
}
static int nft_flow_route(const struct nft_pktinfo *pkt,

View File

@@ -1,410 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:41 +0100
Subject: [PATCH] netfilter: flowtable: add vlan support
Add the vlan id and protocol to the flow tuple to uniquely identify
flows from the receive path. For the transmit path, dev_hard_header() on
the vlan device pushes the headers. This patch includes support for two
vlan headers (QinQ) from the ingress path.
Add a generic encap field to the flowtable entry which stores the
protocol and the tag id. This allows these fields to be reused in the
PPPoE support coming in a later patch.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
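Condensed from nf_flow_tuple_encap() in the hunks below, a sketch of how up
to two VLAN tags end up in the lookup key (hardware-accelerated tag first):

        int i = 0;

        if (skb_vlan_tag_present(skb)) {                /* tag in skb metadata */
                tuple->encap[i].id = skb_vlan_tag_get(skb);
                tuple->encap[i].proto = skb->vlan_proto;
                i++;
        }
        if (skb->protocol == htons(ETH_P_8021Q)) {      /* tag still in payload */
                struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb_mac_header(skb);

                tuple->encap[i].id = ntohs(veth->h_vlan_TCI);
                tuple->encap[i].proto = skb->protocol;
        }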
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -95,6 +95,8 @@ enum flow_offload_xmit_type {
FLOW_OFFLOAD_XMIT_DIRECT,
};
+#define NF_FLOW_TABLE_ENCAP_MAX 2
+
struct flow_offload_tuple {
union {
struct in_addr src_v4;
@@ -113,13 +115,17 @@ struct flow_offload_tuple {
u8 l3proto;
u8 l4proto;
+ struct {
+ u16 id;
+ __be16 proto;
+ } encap[NF_FLOW_TABLE_ENCAP_MAX];
/* All members above are keys for lookups, see flow_offload_hash(). */
struct { } __hash;
- u8 dir:6,
- xmit_type:2;
-
+ u8 dir:4,
+ xmit_type:2,
+ encap_num:2;
u16 mtu;
union {
struct dst_entry *dst_cache;
@@ -173,6 +179,11 @@ struct nf_flow_route {
struct dst_entry *dst;
struct {
u32 ifindex;
+ struct {
+ u16 id;
+ __be16 proto;
+ } encap[NF_FLOW_TABLE_ENCAP_MAX];
+ u8 num_encaps;
} in;
struct {
u32 ifindex;
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -80,6 +80,7 @@ static int flow_offload_fill_route(struc
{
struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
struct dst_entry *dst = route->tuple[dir].dst;
+ int i, j = 0;
switch (flow_tuple->l3proto) {
case NFPROTO_IPV4:
@@ -91,6 +92,12 @@ static int flow_offload_fill_route(struc
}
flow_tuple->iifidx = route->tuple[dir].in.ifindex;
+ for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
+ flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
+ flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
+ j++;
+ }
+ flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
switch (route->tuple[dir].xmit_type) {
case FLOW_OFFLOAD_XMIT_DIRECT:
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -136,23 +136,44 @@ static bool ip_has_options(unsigned int
return thoff != sizeof(struct iphdr);
}
+static void nf_flow_tuple_encap(struct sk_buff *skb,
+ struct flow_offload_tuple *tuple)
+{
+ int i = 0;
+
+ if (skb_vlan_tag_present(skb)) {
+ tuple->encap[i].id = skb_vlan_tag_get(skb);
+ tuple->encap[i].proto = skb->vlan_proto;
+ i++;
+ }
+ if (skb->protocol == htons(ETH_P_8021Q)) {
+ struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+
+ tuple->encap[i].id = ntohs(veth->h_vlan_TCI);
+ tuple->encap[i].proto = skb->protocol;
+ }
+}
+
static int nf_flow_tuple_ip(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple, u32 *hdrsize)
+ struct flow_offload_tuple *tuple, u32 *hdrsize,
+ u32 offset)
{
struct flow_ports *ports;
unsigned int thoff;
struct iphdr *iph;
- if (!pskb_may_pull(skb, sizeof(*iph)))
+ if (!pskb_may_pull(skb, sizeof(*iph) + offset))
return -1;
- iph = ip_hdr(skb);
- thoff = iph->ihl * 4;
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ thoff = (iph->ihl * 4);
if (ip_is_fragment(iph) ||
unlikely(ip_has_options(thoff)))
return -1;
+ thoff += offset;
+
switch (iph->protocol) {
case IPPROTO_TCP:
*hdrsize = sizeof(struct tcphdr);
@@ -167,11 +188,10 @@ static int nf_flow_tuple_ip(struct sk_bu
if (iph->ttl <= 1)
return -1;
- thoff = iph->ihl * 4;
if (!pskb_may_pull(skb, thoff + *hdrsize))
return -1;
- iph = ip_hdr(skb);
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
tuple->src_v4.s_addr = iph->saddr;
@@ -181,6 +201,7 @@ static int nf_flow_tuple_ip(struct sk_bu
tuple->l3proto = AF_INET;
tuple->l4proto = iph->protocol;
tuple->iifidx = dev->ifindex;
+ nf_flow_tuple_encap(skb, tuple);
return 0;
}
@@ -207,6 +228,43 @@ static unsigned int nf_flow_xmit_xfrm(st
return NF_STOLEN;
}
+static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto,
+ u32 *offset)
+{
+ if (skb->protocol == htons(ETH_P_8021Q)) {
+ struct vlan_ethhdr *veth;
+
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+ if (veth->h_vlan_encapsulated_proto == proto) {
+ *offset += VLAN_HLEN;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void nf_flow_encap_pop(struct sk_buff *skb,
+ struct flow_offload_tuple_rhash *tuplehash)
+{
+ struct vlan_hdr *vlan_hdr;
+ int i;
+
+ for (i = 0; i < tuplehash->tuple.encap_num; i++) {
+ if (skb_vlan_tag_present(skb)) {
+ __vlan_hwaccel_clear_tag(skb);
+ continue;
+ }
+ if (skb->protocol == htons(ETH_P_8021Q)) {
+ vlan_hdr = (struct vlan_hdr *)skb->data;
+ __skb_pull(skb, VLAN_HLEN);
+ vlan_set_encap_proto(skb, vlan_hdr);
+ skb_reset_network_header(skb);
+ break;
+ }
+ }
+}
+
static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb,
const struct flow_offload_tuple_rhash *tuplehash,
unsigned short type)
@@ -235,17 +293,18 @@ nf_flow_offload_ip_hook(void *priv, stru
enum flow_offload_tuple_dir dir;
struct flow_offload *flow;
struct net_device *outdev;
+ u32 hdrsize, offset = 0;
+ unsigned int thoff, mtu;
struct rtable *rt;
- unsigned int thoff;
struct iphdr *iph;
__be32 nexthop;
- u32 hdrsize;
int ret;
- if (skb->protocol != htons(ETH_P_IP))
+ if (skb->protocol != htons(ETH_P_IP) &&
+ !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &offset))
return NF_ACCEPT;
- if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize) < 0)
+ if (nf_flow_tuple_ip(skb, state->in, &tuple, &hdrsize, offset) < 0)
return NF_ACCEPT;
tuplehash = flow_offload_lookup(flow_table, &tuple);
@@ -255,11 +314,12 @@ nf_flow_offload_ip_hook(void *priv, stru
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ mtu = flow->tuplehash[dir].tuple.mtu + offset;
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
return NF_ACCEPT;
- iph = ip_hdr(skb);
- thoff = iph->ihl * 4;
+ iph = (struct iphdr *)(skb_network_header(skb) + offset);
+ thoff = (iph->ihl * 4) + offset;
if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
return NF_ACCEPT;
@@ -277,6 +337,9 @@ nf_flow_offload_ip_hook(void *priv, stru
flow_offload_refresh(flow_table, flow);
+ nf_flow_encap_pop(skb, tuplehash);
+ thoff -= offset;
+
iph = ip_hdr(skb);
nf_flow_nat_ip(flow, skb, thoff, dir, iph);
@@ -418,16 +481,18 @@ static void nf_flow_nat_ipv6(const struc
}
static int nf_flow_tuple_ipv6(struct sk_buff *skb, const struct net_device *dev,
- struct flow_offload_tuple *tuple, u32 *hdrsize)
+ struct flow_offload_tuple *tuple, u32 *hdrsize,
+ u32 offset)
{
struct flow_ports *ports;
struct ipv6hdr *ip6h;
unsigned int thoff;
- if (!pskb_may_pull(skb, sizeof(*ip6h)))
+ thoff = sizeof(*ip6h) + offset;
+ if (!pskb_may_pull(skb, thoff))
return -1;
- ip6h = ipv6_hdr(skb);
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
switch (ip6h->nexthdr) {
case IPPROTO_TCP:
@@ -443,11 +508,10 @@ static int nf_flow_tuple_ipv6(struct sk_
if (ip6h->hop_limit <= 1)
return -1;
- thoff = sizeof(*ip6h);
if (!pskb_may_pull(skb, thoff + *hdrsize))
return -1;
- ip6h = ipv6_hdr(skb);
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
tuple->src_v6 = ip6h->saddr;
@@ -457,6 +521,7 @@ static int nf_flow_tuple_ipv6(struct sk_
tuple->l3proto = AF_INET6;
tuple->l4proto = ip6h->nexthdr;
tuple->iifidx = dev->ifindex;
+ nf_flow_tuple_encap(skb, tuple);
return 0;
}
@@ -472,15 +537,17 @@ nf_flow_offload_ipv6_hook(void *priv, st
const struct in6_addr *nexthop;
struct flow_offload *flow;
struct net_device *outdev;
+ unsigned int thoff, mtu;
+ u32 hdrsize, offset = 0;
struct ipv6hdr *ip6h;
struct rt6_info *rt;
- u32 hdrsize;
int ret;
- if (skb->protocol != htons(ETH_P_IPV6))
+ if (skb->protocol != htons(ETH_P_IPV6) &&
+ !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &offset))
return NF_ACCEPT;
- if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize) < 0)
+ if (nf_flow_tuple_ipv6(skb, state->in, &tuple, &hdrsize, offset) < 0)
return NF_ACCEPT;
tuplehash = flow_offload_lookup(flow_table, &tuple);
@@ -490,11 +557,13 @@ nf_flow_offload_ipv6_hook(void *priv, st
dir = tuplehash->tuple.dir;
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
- if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)))
+ mtu = flow->tuplehash[dir].tuple.mtu + offset;
+ if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
return NF_ACCEPT;
- if (nf_flow_state_check(flow, ipv6_hdr(skb)->nexthdr, skb,
- sizeof(*ip6h)))
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
+ thoff = sizeof(*ip6h) + offset;
+ if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff))
return NF_ACCEPT;
if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
@@ -506,11 +575,13 @@ nf_flow_offload_ipv6_hook(void *priv, st
}
}
- if (skb_try_make_writable(skb, sizeof(*ip6h) + hdrsize))
+ if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
flow_offload_refresh(flow_table, flow);
+ nf_flow_encap_pop(skb, tuplehash);
+
ip6h = ipv6_hdr(skb);
nf_flow_nat_ipv6(flow, skb, dir, ip6h);
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -66,6 +66,11 @@ static int nft_dev_fill_forward_path(con
struct nft_forward_info {
const struct net_device *indev;
const struct net_device *outdev;
+ struct id {
+ __u16 id;
+ __be16 proto;
+ } encap[NF_FLOW_TABLE_ENCAP_MAX];
+ u8 num_encaps;
u8 h_source[ETH_ALEN];
u8 h_dest[ETH_ALEN];
enum flow_offload_xmit_type xmit_type;
@@ -84,9 +89,23 @@ static void nft_dev_path_info(const stru
path = &stack->path[i];
switch (path->type) {
case DEV_PATH_ETHERNET:
+ case DEV_PATH_VLAN:
info->indev = path->dev;
if (is_zero_ether_addr(info->h_source))
memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+
+ if (path->type == DEV_PATH_ETHERNET)
+ break;
+
+ /* DEV_PATH_VLAN */
+ if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
+ info->indev = NULL;
+ break;
+ }
+ info->outdev = path->dev;
+ info->encap[info->num_encaps].id = path->encap.id;
+ info->encap[info->num_encaps].proto = path->encap.proto;
+ info->num_encaps++;
break;
case DEV_PATH_BRIDGE:
if (is_zero_ether_addr(info->h_source))
@@ -94,7 +113,6 @@ static void nft_dev_path_info(const stru
info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
break;
- case DEV_PATH_VLAN:
default:
info->indev = NULL;
break;
@@ -130,6 +148,7 @@ static void nft_dev_forward_path(struct
struct net_device_path_stack stack;
struct nft_forward_info info = {};
unsigned char ha[ETH_ALEN];
+ int i;
if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
nft_dev_path_info(&stack, &info, ha);
@@ -138,6 +157,11 @@ static void nft_dev_forward_path(struct
return;
route->tuple[!dir].in.ifindex = info.indev->ifindex;
+ for (i = 0; i < info.num_encaps; i++) {
+ route->tuple[!dir].in.encap[i].id = info.encap[i].id;
+ route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
+ }
+ route->tuple[!dir].in.num_encaps = info.num_encaps;
if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
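A minimal standalone sketch (not part of the patch) of the encap bookkeeping introduced above: the lookup tuple records up to NF_FLOW_TABLE_ENCAP_MAX (2 in this series) VLAN tags, outermost first, and nft_dev_path_info() gives up on a path that carries more tags than that. Types are simplified here; only the id/proto pair and the counter mirror the kernel structures.

#include <stdint.h>

#define ENCAP_MAX 2  /* mirrors NF_FLOW_TABLE_ENCAP_MAX */

struct encap_entry {
        uint16_t id;     /* VLAN ID */
        uint16_t proto;  /* e.g. 0x8100 for 802.1Q */
};

struct tuple_sketch {
        struct encap_entry encap[ENCAP_MAX];
        uint8_t num;
};

/* returns 0 on success, -1 when the tuple cannot hold another tag,
 * which is where nft_dev_path_info() clears info->indev and bails out */
static int record_encap(struct tuple_sketch *t, uint16_t id, uint16_t proto)
{
        if (t->num >= ENCAP_MAX)
                return -1;
        t->encap[t->num].id = id;
        t->encap[t->num].proto = proto;
        t->num++;
        return 0;
}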

View File

@ -1,30 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:42 +0100
Subject: [PATCH] netfilter: flowtable: add bridge vlan filtering support
Add the vlan tag based on the bridge information when PVID is set on the bridge port.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -111,6 +111,18 @@ static void nft_dev_path_info(const stru
if (is_zero_ether_addr(info->h_source))
memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
+ switch (path->bridge.vlan_mode) {
+ case DEV_PATH_BR_VLAN_TAG:
+ info->encap[info->num_encaps].id = path->bridge.vlan_id;
+ info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
+ info->num_encaps++;
+ break;
+ case DEV_PATH_BR_VLAN_UNTAG:
+ info->num_encaps--;
+ break;
+ case DEV_PATH_BR_VLAN_KEEP:
+ break;
+ }
info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
break;
default:
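As a rough illustration (again a toy model, not kernel code), the three bridge VLAN filtering modes handled above map onto the tuple's encap stack as a push, a pop, or a no-op:

#include <stdint.h>

enum br_vlan_mode { BR_VLAN_KEEP, BR_VLAN_TAG, BR_VLAN_UNTAG };

struct encap_stack {
        uint16_t vid[2];  /* at most NF_FLOW_TABLE_ENCAP_MAX entries */
        uint8_t num;
};

static void apply_bridge_vlan(struct encap_stack *s, enum br_vlan_mode mode,
                              uint16_t vid)
{
        switch (mode) {
        case BR_VLAN_TAG:        /* PVID tags on ingress: push */
                if (s->num < 2)
                        s->vid[s->num++] = vid;
                break;
        case BR_VLAN_UNTAG:      /* untagged on egress: pop */
                if (s->num > 0)
                        s->num--;
                break;
        case BR_VLAN_KEEP:       /* tag travels with the packet */
                break;
        }
}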

View File

@ -1,145 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:43 +0100
Subject: [PATCH] netfilter: flowtable: add pppoe support
Add the PPPoE protocol and session id to the flow tuple using the encap
fields to uniquely identify flows from the receive path. For the
transmit path, dev_hard_header() on the vlan device pushes the headers.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -7,6 +7,9 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/netdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
@@ -139,6 +142,8 @@ static bool ip_has_options(unsigned int
static void nf_flow_tuple_encap(struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
+ struct vlan_ethhdr *veth;
+ struct pppoe_hdr *phdr;
int i = 0;
if (skb_vlan_tag_present(skb)) {
@@ -146,11 +151,17 @@ static void nf_flow_tuple_encap(struct s
tuple->encap[i].proto = skb->vlan_proto;
i++;
}
- if (skb->protocol == htons(ETH_P_8021Q)) {
- struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb_mac_header(skb);
-
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ veth = (struct vlan_ethhdr *)skb_mac_header(skb);
tuple->encap[i].id = ntohs(veth->h_vlan_TCI);
tuple->encap[i].proto = skb->protocol;
+ break;
+ case htons(ETH_P_PPP_SES):
+ phdr = (struct pppoe_hdr *)skb_mac_header(skb);
+ tuple->encap[i].id = ntohs(phdr->sid);
+ tuple->encap[i].proto = skb->protocol;
+ break;
}
}
@@ -228,17 +239,41 @@ static unsigned int nf_flow_xmit_xfrm(st
return NF_STOLEN;
}
+static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb)
+{
+ __be16 proto;
+
+ proto = *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+ sizeof(struct pppoe_hdr)));
+ switch (proto) {
+ case htons(PPP_IP):
+ return htons(ETH_P_IP);
+ case htons(PPP_IPV6):
+ return htons(ETH_P_IPV6);
+ }
+
+ return 0;
+}
+
static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto,
u32 *offset)
{
- if (skb->protocol == htons(ETH_P_8021Q)) {
- struct vlan_ethhdr *veth;
+ struct vlan_ethhdr *veth;
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
veth = (struct vlan_ethhdr *)skb_mac_header(skb);
if (veth->h_vlan_encapsulated_proto == proto) {
*offset += VLAN_HLEN;
return true;
}
+ break;
+ case htons(ETH_P_PPP_SES):
+ if (nf_flow_pppoe_proto(skb) == proto) {
+ *offset += PPPOE_SES_HLEN;
+ return true;
+ }
+ break;
}
return false;
@@ -255,12 +290,18 @@ static void nf_flow_encap_pop(struct sk_
__vlan_hwaccel_clear_tag(skb);
continue;
}
- if (skb->protocol == htons(ETH_P_8021Q)) {
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
vlan_hdr = (struct vlan_hdr *)skb->data;
__skb_pull(skb, VLAN_HLEN);
vlan_set_encap_proto(skb, vlan_hdr);
skb_reset_network_header(skb);
break;
+ case htons(ETH_P_PPP_SES):
+ skb->protocol = nf_flow_pppoe_proto(skb);
+ skb_pull(skb, PPPOE_SES_HLEN);
+ skb_reset_network_header(skb);
+ break;
}
}
}
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -90,6 +90,7 @@ static void nft_dev_path_info(const stru
switch (path->type) {
case DEV_PATH_ETHERNET:
case DEV_PATH_VLAN:
+ case DEV_PATH_PPPOE:
info->indev = path->dev;
if (is_zero_ether_addr(info->h_source))
memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
@@ -97,7 +98,7 @@ static void nft_dev_path_info(const stru
if (path->type == DEV_PATH_ETHERNET)
break;
- /* DEV_PATH_VLAN */
+ /* DEV_PATH_VLAN and DEV_PATH_PPPOE */
if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
info->indev = NULL;
break;
@@ -106,6 +107,8 @@ static void nft_dev_path_info(const stru
info->encap[info->num_encaps].id = path->encap.id;
info->encap[info->num_encaps].proto = path->encap.proto;
info->num_encaps++;
+ if (path->type == DEV_PATH_PPPOE)
+ memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN);
break;
case DEV_PATH_BRIDGE:
if (is_zero_ether_addr(info->h_source))
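The mapping performed by nf_flow_pppoe_proto() above can be summarized in the following standalone sketch (not part of the patch). The constants are the standard PPP protocol numbers and ethertypes, shown in host byte order for simplicity; the kernel code compares big-endian values.

#include <stdint.h>

#define PPP_IP     0x0021   /* PPP: Internet Protocol v4 */
#define PPP_IPV6   0x0057   /* PPP: Internet Protocol v6 */
#define ETH_P_IP   0x0800
#define ETH_P_IPV6 0x86DD

/* the 16-bit PPP protocol field right after the PPPoE session header
 * selects the inner ethertype; anything else is not offloadable */
static uint16_t pppoe_inner_ethertype(uint16_t ppp_proto)
{
        switch (ppp_proto) {
        case PPP_IP:
                return ETH_P_IP;
        case PPP_IPV6:
                return ETH_P_IPV6;
        default:
                return 0;
        }
}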

View File

@ -1,32 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:44 +0100
Subject: [PATCH] netfilter: flowtable: add dsa support
Replace the master ethernet device with the dsa slave port. Packets coming
in from the software ingress path use the dsa slave port as the input
device.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -89,6 +89,7 @@ static void nft_dev_path_info(const stru
path = &stack->path[i];
switch (path->type) {
case DEV_PATH_ETHERNET:
+ case DEV_PATH_DSA:
case DEV_PATH_VLAN:
case DEV_PATH_PPPOE:
info->indev = path->dev;
@@ -97,6 +98,10 @@ static void nft_dev_path_info(const stru
if (path->type == DEV_PATH_ETHERNET)
break;
+ if (path->type == DEV_PATH_DSA) {
+ i = stack->num_paths;
+ break;
+ }
/* DEV_PATH_VLAN and DEV_PATH_PPPOE */
if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) {
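The early loop exit in the hunk above is easy to miss: when a DSA port is found, the index is bumped past the end of the stack so that no further path elements are visited. A compact sketch of that control flow (simplified types, not kernel code):

#define MAX_PATHS 5

enum path_type { PATH_ETHERNET, PATH_DSA, PATH_VLAN, PATH_PPPOE };

struct path { enum path_type type; int ifindex; };
struct path_stack { int num_paths; struct path path[MAX_PATHS]; };

static int pick_indev(const struct path_stack *stack)
{
        int indev = -1;
        int i;

        for (i = 0; i < stack->num_paths; i++) {
                const struct path *p = &stack->path[i];

                switch (p->type) {
                case PATH_ETHERNET:
                case PATH_DSA:
                case PATH_VLAN:
                case PATH_PPPOE:
                        indev = p->ifindex;
                        if (p->type == PATH_DSA)
                                i = stack->num_paths;  /* stop the walk */
                        break;
                }
        }
        return indev;
}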

View File

@ -1,107 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:45 +0100
Subject: [PATCH] selftests: netfilter: flowtable bridge and vlan support
This patch adds two new tests to cover bridge and vlan support:
- Add a bridge device to the Router1 (nsr1) container and attach the
veth0 device to the bridge. Set the IP address on the bridge device
to exercise the bridge forwarding path.
- Add vlan encapsulation between the bridge device in Router1 and
one of the sender containers (ns1).
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/tools/testing/selftests/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/netfilter/nft_flowtable.sh
@@ -370,6 +370,88 @@ else
ip netns exec nsr1 nft list ruleset
fi
+# Another test:
+# Add bridge interface br0 to Router1, with NAT enabled.
+ip -net nsr1 link add name br0 type bridge
+ip -net nsr1 addr flush dev veth0
+ip -net nsr1 link set up dev veth0
+ip -net nsr1 link set veth0 master br0
+ip -net nsr1 addr add 10.0.1.1/24 dev br0
+ip -net nsr1 addr add dead:1::1/64 dev br0
+ip -net nsr1 link set up dev br0
+
+ip netns exec nsr1 sysctl net.ipv4.conf.br0.forwarding=1 > /dev/null
+
+# br0 with NAT enabled.
+ip netns exec nsr1 nft -f - <<EOF
+flush table ip nat
+table ip nat {
+ chain prerouting {
+ type nat hook prerouting priority 0; policy accept;
+ meta iif "br0" ip daddr 10.6.6.6 tcp dport 1666 counter dnat ip to 10.0.2.99:12345
+ }
+
+ chain postrouting {
+ type nat hook postrouting priority 0; policy accept;
+ meta oifname "veth1" counter masquerade
+ }
+}
+EOF
+
+if test_tcp_forwarding_nat ns1 ns2; then
+ echo "PASS: flow offloaded for ns1/ns2 with bridge NAT"
+else
+ echo "FAIL: flow offload for ns1/ns2 with bridge NAT" 1>&2
+ ip netns exec nsr1 nft list ruleset
+ ret=1
+fi
+
+# Another test:
+# Add bridge interface br0 to Router1, with NAT and VLAN.
+ip -net nsr1 link set veth0 nomaster
+ip -net nsr1 link set down dev veth0
+ip -net nsr1 link add link veth0 name veth0.10 type vlan id 10
+ip -net nsr1 link set up dev veth0
+ip -net nsr1 link set up dev veth0.10
+ip -net nsr1 link set veth0.10 master br0
+
+ip -net ns1 addr flush dev eth0
+ip -net ns1 link add link eth0 name eth0.10 type vlan id 10
+ip -net ns1 link set eth0 up
+ip -net ns1 link set eth0.10 up
+ip -net ns1 addr add 10.0.1.99/24 dev eth0.10
+ip -net ns1 route add default via 10.0.1.1
+ip -net ns1 addr add dead:1::99/64 dev eth0.10
+
+if test_tcp_forwarding_nat ns1 ns2; then
+ echo "PASS: flow offloaded for ns1/ns2 with bridge NAT and VLAN"
+else
+ echo "FAIL: flow offload for ns1/ns2 with bridge NAT and VLAN" 1>&2
+ ip netns exec nsr1 nft list ruleset
+ ret=1
+fi
+
+# restore test topology (remove bridge and VLAN)
+ip -net nsr1 link set veth0 nomaster
+ip -net nsr1 link set veth0 down
+ip -net nsr1 link set veth0.10 down
+ip -net nsr1 link delete veth0.10 type vlan
+ip -net nsr1 link delete br0 type bridge
+ip -net ns1 addr flush dev eth0.10
+ip -net ns1 link set eth0.10 down
+ip -net ns1 link set eth0 down
+ip -net ns1 link delete eth0.10 type vlan
+
+# restore address in ns1 and nsr1
+ip -net ns1 link set eth0 up
+ip -net ns1 addr add 10.0.1.99/24 dev eth0
+ip -net ns1 route add default via 10.0.1.1
+ip -net ns1 addr add dead:1::99/64 dev eth0
+ip -net ns1 route add default via dead:1::1
+ip -net nsr1 addr add 10.0.1.1/24 dev veth0
+ip -net nsr1 addr add dead:1::1/64 dev veth0
+ip -net nsr1 link set up dev veth0
+
KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1)
KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1)
SPI1=$RANDOM

View File

@ -1,310 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:46 +0100
Subject: [PATCH] netfilter: flowtable: add offload support for xmit path
types
When the flow tuple xmit_type is set to FLOW_OFFLOAD_XMIT_DIRECT, the
dst_cache pointer is not valid, and the h_source/h_dest/ifidx out fields
need to be used.
This patch also adds the FLOW_ACTION_VLAN_PUSH action to pass the VLAN
tag to the driver.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -177,28 +177,45 @@ static int flow_offload_eth_src(struct n
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- const struct flow_offload_tuple *tuple = &flow->tuplehash[!dir].tuple;
struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
- struct net_device *dev;
+ const struct flow_offload_tuple *other_tuple, *this_tuple;
+ struct net_device *dev = NULL;
+ const unsigned char *addr;
u32 mask, val;
u16 val16;
- dev = dev_get_by_index(net, tuple->iifidx);
- if (!dev)
- return -ENOENT;
+ this_tuple = &flow->tuplehash[dir].tuple;
+
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ addr = this_tuple->out.h_source;
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ dev = dev_get_by_index(net, other_tuple->iifidx);
+ if (!dev)
+ return -ENOENT;
+
+ addr = dev->dev_addr;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
mask = ~0xffff0000;
- memcpy(&val16, dev->dev_addr, 2);
+ memcpy(&val16, addr, 2);
val = val16 << 16;
flow_offload_mangle(entry0, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
&val, &mask);
mask = ~0xffffffff;
- memcpy(&val, dev->dev_addr + 2, 4);
+ memcpy(&val, addr + 2, 4);
flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 8,
&val, &mask);
- dev_put(dev);
+
+ if (dev)
+ dev_put(dev);
return 0;
}
@@ -210,27 +227,40 @@ static int flow_offload_eth_dst(struct n
{
struct flow_action_entry *entry0 = flow_action_entry_next(flow_rule);
struct flow_action_entry *entry1 = flow_action_entry_next(flow_rule);
- const void *daddr = &flow->tuplehash[!dir].tuple.src_v4;
+ const struct flow_offload_tuple *other_tuple, *this_tuple;
const struct dst_entry *dst_cache;
unsigned char ha[ETH_ALEN];
struct neighbour *n;
+ const void *daddr;
u32 mask, val;
u8 nud_state;
u16 val16;
- dst_cache = flow->tuplehash[dir].tuple.dst_cache;
- n = dst_neigh_lookup(dst_cache, daddr);
- if (!n)
- return -ENOENT;
-
- read_lock_bh(&n->lock);
- nud_state = n->nud_state;
- ether_addr_copy(ha, n->ha);
- read_unlock_bh(&n->lock);
+ this_tuple = &flow->tuplehash[dir].tuple;
- if (!(nud_state & NUD_VALID)) {
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ ether_addr_copy(ha, this_tuple->out.h_dest);
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ daddr = &other_tuple->src_v4;
+ dst_cache = this_tuple->dst_cache;
+ n = dst_neigh_lookup(dst_cache, daddr);
+ if (!n)
+ return -ENOENT;
+
+ read_lock_bh(&n->lock);
+ nud_state = n->nud_state;
+ ether_addr_copy(ha, n->ha);
+ read_unlock_bh(&n->lock);
neigh_release(n);
- return -ENOENT;
+
+ if (!(nud_state & NUD_VALID))
+ return -ENOENT;
+ break;
+ default:
+ return -EOPNOTSUPP;
}
mask = ~0xffffffff;
@@ -243,7 +273,6 @@ static int flow_offload_eth_dst(struct n
val = val16;
flow_offload_mangle(entry1, FLOW_ACT_MANGLE_HDR_TYPE_ETH, 4,
&val, &mask);
- neigh_release(n);
return 0;
}
@@ -465,27 +494,52 @@ static void flow_offload_ipv4_checksum(s
}
}
-static void flow_offload_redirect(const struct flow_offload *flow,
+static void flow_offload_redirect(struct net *net,
+ const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
- struct rtable *rt;
+ const struct flow_offload_tuple *this_tuple, *other_tuple;
+ struct flow_action_entry *entry;
+ struct net_device *dev;
+ int ifindex;
+
+ this_tuple = &flow->tuplehash[dir].tuple;
+ switch (this_tuple->xmit_type) {
+ case FLOW_OFFLOAD_XMIT_DIRECT:
+ this_tuple = &flow->tuplehash[dir].tuple;
+ ifindex = this_tuple->out.ifidx;
+ break;
+ case FLOW_OFFLOAD_XMIT_NEIGH:
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ ifindex = other_tuple->iifidx;
+ break;
+ default:
+ return;
+ }
- rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache;
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return;
+
+ entry = flow_action_entry_next(flow_rule);
entry->id = FLOW_ACTION_REDIRECT;
- entry->dev = rt->dst.dev;
- dev_hold(rt->dst.dev);
+ entry->dev = dev;
}
static void flow_offload_encap_tunnel(const struct flow_offload *flow,
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
+ const struct flow_offload_tuple *this_tuple;
struct flow_action_entry *entry;
struct dst_entry *dst;
- dst = flow->tuplehash[dir].tuple.dst_cache;
+ this_tuple = &flow->tuplehash[dir].tuple;
+ if (this_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
+ return;
+
+ dst = this_tuple->dst_cache;
if (dst && dst->lwtstate) {
struct ip_tunnel_info *tun_info;
@@ -502,10 +556,15 @@ static void flow_offload_decap_tunnel(co
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
+ const struct flow_offload_tuple *other_tuple;
struct flow_action_entry *entry;
struct dst_entry *dst;
- dst = flow->tuplehash[!dir].tuple.dst_cache;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_DIRECT)
+ return;
+
+ dst = other_tuple->dst_cache;
if (dst && dst->lwtstate) {
struct ip_tunnel_info *tun_info;
@@ -517,10 +576,14 @@ static void flow_offload_decap_tunnel(co
}
}
-int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
- enum flow_offload_tuple_dir dir,
- struct nf_flow_rule *flow_rule)
+static int
+nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
{
+ const struct flow_offload_tuple *other_tuple;
+ int i;
+
flow_offload_decap_tunnel(flow, dir, flow_rule);
flow_offload_encap_tunnel(flow, dir, flow_rule);
@@ -528,6 +591,26 @@ int nf_flow_rule_route_ipv4(struct net *
flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
return -1;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+
+ for (i = 0; i < other_tuple->encap_num; i++) {
+ struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+
+ entry->id = FLOW_ACTION_VLAN_PUSH;
+ entry->vlan.vid = other_tuple->encap[i].id;
+ entry->vlan.proto = other_tuple->encap[i].proto;
+ }
+
+ return 0;
+}
+
+int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+{
+ if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0)
+ return -1;
+
if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
flow_offload_ipv4_snat(net, flow, dir, flow_rule);
flow_offload_port_snat(net, flow, dir, flow_rule);
@@ -540,7 +623,7 @@ int nf_flow_rule_route_ipv4(struct net *
test_bit(NF_FLOW_DNAT, &flow->flags))
flow_offload_ipv4_checksum(net, flow, flow_rule);
- flow_offload_redirect(flow, dir, flow_rule);
+ flow_offload_redirect(net, flow, dir, flow_rule);
return 0;
}
@@ -550,11 +633,7 @@ int nf_flow_rule_route_ipv6(struct net *
enum flow_offload_tuple_dir dir,
struct nf_flow_rule *flow_rule)
{
- flow_offload_decap_tunnel(flow, dir, flow_rule);
- flow_offload_encap_tunnel(flow, dir, flow_rule);
-
- if (flow_offload_eth_src(net, flow, dir, flow_rule) < 0 ||
- flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
+ if (nf_flow_rule_route_common(net, flow, dir, flow_rule) < 0)
return -1;
if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
@@ -566,7 +645,7 @@ int nf_flow_rule_route_ipv6(struct net *
flow_offload_port_dnat(net, flow, dir, flow_rule);
}
- flow_offload_redirect(flow, dir, flow_rule);
+ flow_offload_redirect(net, flow, dir, flow_rule);
return 0;
}
@@ -580,10 +659,10 @@ nf_flow_offload_rule_alloc(struct net *n
enum flow_offload_tuple_dir dir)
{
const struct nf_flowtable *flowtable = offload->flowtable;
+ const struct flow_offload_tuple *tuple, *other_tuple;
const struct flow_offload *flow = offload->flow;
- const struct flow_offload_tuple *tuple;
+ struct dst_entry *other_dst = NULL;
struct nf_flow_rule *flow_rule;
- struct dst_entry *other_dst;
int err = -ENOMEM;
flow_rule = kzalloc(sizeof(*flow_rule), GFP_KERNEL);
@@ -599,7 +678,10 @@ nf_flow_offload_rule_alloc(struct net *n
flow_rule->rule->match.key = &flow_rule->match.key;
tuple = &flow->tuplehash[dir].tuple;
- other_dst = flow->tuplehash[!dir].tuple.dst_cache;
+ other_tuple = &flow->tuplehash[!dir].tuple;
+ if (other_tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH)
+ other_dst = other_tuple->dst_cache;
+
err = nf_flow_rule_match(&flow_rule->match, tuple, other_dst);
if (err < 0)
goto err_flow_match;
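The key behavioural change above is how the egress interface is chosen. A condensed sketch of flow_offload_redirect()'s selection logic, with simplified types (-1 stands in for "no redirect action emitted"):

#include <stdint.h>

enum xmit_type { XMIT_NEIGH, XMIT_XFRM, XMIT_DIRECT };

struct xmit_tuple {
        uint32_t iifidx;                 /* ingress device of this direction */
        enum xmit_type xmit_type;
        struct { uint32_t ifidx; } out;  /* valid for XMIT_DIRECT only */
};

static int redirect_ifindex(const struct xmit_tuple *this_tuple,
                            const struct xmit_tuple *other_tuple)
{
        switch (this_tuple->xmit_type) {
        case XMIT_DIRECT:
                return (int)this_tuple->out.ifidx;
        case XMIT_NEIGH:
                /* the reply direction's ingress device is our egress */
                return (int)other_tuple->iifidx;
        default:
                return -1;
        }
}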

View File

@ -1,114 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:47 +0100
Subject: [PATCH] netfilter: nft_flow_offload: use direct xmit if
hardware offload is enabled
If there is a forward path to reach an ethernet device and hardware
offload is enabled, then use the direct xmit path.
Moreover, store the real device in the direct xmit path info since the
software datapath uses dev_hard_header() to push the layer encapsulation
headers while hardware offload refers to the real device.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -131,6 +131,7 @@ struct flow_offload_tuple {
struct dst_entry *dst_cache;
struct {
u32 ifidx;
+ u32 hw_ifidx;
u8 h_source[ETH_ALEN];
u8 h_dest[ETH_ALEN];
} out;
@@ -187,6 +188,7 @@ struct nf_flow_route {
} in;
struct {
u32 ifindex;
+ u32 hw_ifindex;
u8 h_source[ETH_ALEN];
u8 h_dest[ETH_ALEN];
} out;
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -106,6 +106,7 @@ static int flow_offload_fill_route(struc
memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
ETH_ALEN);
flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
+ flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex;
break;
case FLOW_OFFLOAD_XMIT_XFRM:
case FLOW_OFFLOAD_XMIT_NEIGH:
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -508,7 +508,7 @@ static void flow_offload_redirect(struct
switch (this_tuple->xmit_type) {
case FLOW_OFFLOAD_XMIT_DIRECT:
this_tuple = &flow->tuplehash[dir].tuple;
- ifindex = this_tuple->out.ifidx;
+ ifindex = this_tuple->out.hw_ifidx;
break;
case FLOW_OFFLOAD_XMIT_NEIGH:
other_tuple = &flow->tuplehash[!dir].tuple;
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -66,6 +66,7 @@ static int nft_dev_fill_forward_path(con
struct nft_forward_info {
const struct net_device *indev;
const struct net_device *outdev;
+ const struct net_device *hw_outdev;
struct id {
__u16 id;
__be16 proto;
@@ -76,9 +77,18 @@ struct nft_forward_info {
enum flow_offload_xmit_type xmit_type;
};
+static bool nft_is_valid_ether_device(const struct net_device *dev)
+{
+ if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
+ dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr))
+ return false;
+
+ return true;
+}
+
static void nft_dev_path_info(const struct net_device_path_stack *stack,
struct nft_forward_info *info,
- unsigned char *ha)
+ unsigned char *ha, struct nf_flowtable *flowtable)
{
const struct net_device_path *path;
int i;
@@ -140,6 +150,12 @@ static void nft_dev_path_info(const stru
}
if (!info->outdev)
info->outdev = info->indev;
+
+ info->hw_outdev = info->indev;
+
+ if (nf_flowtable_hw_offload(flowtable) &&
+ nft_is_valid_ether_device(info->indev))
+ info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT;
}
static bool nft_flowtable_find_dev(const struct net_device *dev,
@@ -171,7 +187,7 @@ static void nft_dev_forward_path(struct
int i;
if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0)
- nft_dev_path_info(&stack, &info, ha);
+ nft_dev_path_info(&stack, &info, ha, &ft->data);
if (!info.indev || !nft_flowtable_find_dev(info.indev, ft))
return;
@@ -187,6 +203,7 @@ static void nft_dev_forward_path(struct
memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN);
route->tuple[dir].out.ifindex = info.outdev->ifindex;
+ route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex;
route->tuple[dir].xmit_type = info.xmit_type;
}
}

View File

@ -1,123 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 24 Mar 2021 02:30:48 +0100
Subject: [PATCH] netfilter: flowtable: bridge vlan hardware offload and
switchdev
The switch might have already added the VLAN tag through PVID hardware
offload. Keep this extra VLAN in the flowtable but skip it on egress.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -849,6 +849,7 @@ struct net_device_path {
DEV_PATH_BR_VLAN_KEEP,
DEV_PATH_BR_VLAN_TAG,
DEV_PATH_BR_VLAN_UNTAG,
+ DEV_PATH_BR_VLAN_UNTAG_HW,
} vlan_mode;
u16 vlan_id;
__be16 vlan_proto;
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -123,9 +123,10 @@ struct flow_offload_tuple {
/* All members above are keys for lookups, see flow_offload_hash(). */
struct { } __hash;
- u8 dir:4,
+ u8 dir:2,
xmit_type:2,
- encap_num:2;
+ encap_num:2,
+ in_vlan_ingress:2;
u16 mtu;
union {
struct dst_entry *dst_cache;
@@ -184,7 +185,8 @@ struct nf_flow_route {
u16 id;
__be16 proto;
} encap[NF_FLOW_TABLE_ENCAP_MAX];
- u8 num_encaps;
+ u8 num_encaps:2,
+ ingress_vlans:2;
} in;
struct {
u32 ifindex;
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -435,6 +435,7 @@ static int br_fill_forward_path(struct n
ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto;
ctx->num_vlans++;
break;
+ case DEV_PATH_BR_VLAN_UNTAG_HW:
case DEV_PATH_BR_VLAN_UNTAG:
ctx->num_vlans--;
break;
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1374,6 +1374,8 @@ int br_vlan_fill_forward_path_mode(struc
if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG)
path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+ else if (v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW;
else
path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG;
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -95,6 +95,8 @@ static int flow_offload_fill_route(struc
for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
+ if (route->tuple[dir].in.ingress_vlans & BIT(i))
+ flow_tuple->in_vlan_ingress |= BIT(j);
j++;
}
flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -594,8 +594,12 @@ nf_flow_rule_route_common(struct net *ne
other_tuple = &flow->tuplehash[!dir].tuple;
for (i = 0; i < other_tuple->encap_num; i++) {
- struct flow_action_entry *entry = flow_action_entry_next(flow_rule);
+ struct flow_action_entry *entry;
+ if (other_tuple->in_vlan_ingress & BIT(i))
+ continue;
+
+ entry = flow_action_entry_next(flow_rule);
entry->id = FLOW_ACTION_VLAN_PUSH;
entry->vlan.vid = other_tuple->encap[i].id;
entry->vlan.proto = other_tuple->encap[i].proto;
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -72,6 +72,7 @@ struct nft_forward_info {
__be16 proto;
} encap[NF_FLOW_TABLE_ENCAP_MAX];
u8 num_encaps;
+ u8 ingress_vlans;
u8 h_source[ETH_ALEN];
u8 h_dest[ETH_ALEN];
enum flow_offload_xmit_type xmit_type;
@@ -130,6 +131,9 @@ static void nft_dev_path_info(const stru
memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN);
switch (path->bridge.vlan_mode) {
+ case DEV_PATH_BR_VLAN_UNTAG_HW:
+ info->ingress_vlans |= BIT(info->num_encaps - 1);
+ break;
case DEV_PATH_BR_VLAN_TAG:
info->encap[info->num_encaps].id = path->bridge.vlan_id;
info->encap[info->num_encaps].proto = path->bridge.vlan_proto;
@@ -198,6 +202,7 @@ static void nft_dev_forward_path(struct
route->tuple[!dir].in.encap[i].proto = info.encap[i].proto;
}
route->tuple[!dir].in.num_encaps = info.num_encaps;
+ route->tuple[!dir].in.ingress_vlans = info.ingress_vlans;
if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) {
memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN);
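A sketch of the egress action-building rule this patch establishes: encap slots whose bit is set in in_vlan_ingress were already tagged by the switch through PVID offload, so they stay in the lookup key but produce no VLAN push action (simplified types, not kernel code):

#include <stdint.h>

struct encap { uint16_t id; uint16_t proto; };

static int build_vlan_pushes(const struct encap *encap, int encap_num,
                             uint8_t in_vlan_ingress, struct encap *out)
{
        int i, n = 0;

        for (i = 0; i < encap_num; i++) {
                if (in_vlan_ingress & (1u << i))
                        continue;  /* hardware already pushed this tag */
                out[n++] = encap[i];
        }
        return n;  /* number of VLAN push actions to emit */
}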

View File

@ -1,30 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:49 +0100
Subject: [PATCH] net: flow_offload: add FLOW_ACTION_PPPOE_PUSH
Add an action to represent the PPPoE hardware offload support that
includes the session ID.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/flow_offload.h
+++ b/include/net/flow_offload.h
@@ -147,6 +147,7 @@ enum flow_action_id {
FLOW_ACTION_MPLS_POP,
FLOW_ACTION_MPLS_MANGLE,
FLOW_ACTION_GATE,
+ FLOW_ACTION_PPPOE_PUSH,
NUM_FLOW_ACTIONS,
};
@@ -271,6 +272,9 @@ struct flow_action_entry {
u32 num_entries;
struct action_gate_entry *entries;
} gate;
+ struct { /* FLOW_ACTION_PPPOE_PUSH */
+ u16 sid;
+ } pppoe;
};
struct flow_action_cookie *cookie; /* user defined action cookie */
};

View File

@ -1,35 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:50 +0100
Subject: [PATCH] netfilter: flowtable: support for
FLOW_ACTION_PPPOE_PUSH
Add a PPPoE push action when the layer 2 protocol is ETH_P_PPP_SES,
enabling PPPoE flowtable hardware offload support.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -600,9 +600,18 @@ nf_flow_rule_route_common(struct net *ne
continue;
entry = flow_action_entry_next(flow_rule);
- entry->id = FLOW_ACTION_VLAN_PUSH;
- entry->vlan.vid = other_tuple->encap[i].id;
- entry->vlan.proto = other_tuple->encap[i].proto;
+
+ switch (other_tuple->encap[i].proto) {
+ case htons(ETH_P_PPP_SES):
+ entry->id = FLOW_ACTION_PPPOE_PUSH;
+ entry->pppoe.sid = other_tuple->encap[i].id;
+ break;
+ case htons(ETH_P_8021Q):
+ entry->id = FLOW_ACTION_VLAN_PUSH;
+ entry->vlan.vid = other_tuple->encap[i].id;
+ entry->vlan.proto = other_tuple->encap[i].proto;
+ break;
+ }
}
return 0;

View File

@ -1,53 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:51 +0100
Subject: [PATCH] dsa: slave: add support for TC_SETUP_FT
The dsa infrastructure provides a well-defined hierarchy of devices;
pass the call to set up the flow block up to the master device. From the
software dataplane, the netfilter infrastructure uses the dsa slave
devices to refer to the input and output device for the given skbuff.
Similarly, the flowtable definition in the ruleset refers to the dsa
slave port devices.
This patch adds the glue code to call ndo_setup_tc with TC_SETUP_FT
on the master device via the dsa slave devices.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1239,14 +1239,32 @@ static int dsa_slave_setup_tc_block(stru
}
}
+static int dsa_slave_setup_ft_block(struct dsa_switch *ds, int port,
+ void *type_data)
+{
+ struct dsa_port *cpu_dp = dsa_to_port(ds, port)->cpu_dp;
+ struct net_device *master = cpu_dp->master;
+
+ if (!master->netdev_ops->ndo_setup_tc)
+ return -EOPNOTSUPP;
+
+ return master->netdev_ops->ndo_setup_tc(master, TC_SETUP_FT, type_data);
+}
+
static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type,
void *type_data)
{
struct dsa_port *dp = dsa_slave_to_port(dev);
struct dsa_switch *ds = dp->ds;
- if (type == TC_SETUP_BLOCK)
+ switch (type) {
+ case TC_SETUP_BLOCK:
return dsa_slave_setup_tc_block(dev, type_data);
+ case TC_SETUP_FT:
+ return dsa_slave_setup_ft_block(ds, dp->index, type_data);
+ default:
+ break;
+ }
if (!ds->ops->port_setup_tc)
return -EOPNOTSUPP;
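The pass-up itself is a small amount of glue. A self-contained sketch of the pattern, with hypothetical stand-ins for the kernel types (the real code reaches the master through dsa_to_port()->cpu_dp->master, and TC_SETUP_FT is the kernel's enum value, not the illustrative constant used here):

#include <stddef.h>

#define EOPNOTSUPP 95
#define TC_SETUP_FT 1  /* illustrative value only */

struct net_device;
struct net_device_ops {
        int (*ndo_setup_tc)(struct net_device *dev, int type, void *data);
};
struct net_device { const struct net_device_ops *netdev_ops; };

/* a DSA slave has no offload hardware of its own, so TC_SETUP_FT is
 * handed to the CPU port's master device when it implements ndo_setup_tc */
static int setup_ft_via_master(struct net_device *master, void *type_data)
{
        if (!master->netdev_ops->ndo_setup_tc)
                return -EOPNOTSUPP;
        return master->netdev_ops->ndo_setup_tc(master, TC_SETUP_FT,
                                                type_data);
}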

View File

@ -1,68 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 24 Mar 2021 02:30:52 +0100
Subject: [PATCH] net: ethernet: mtk_eth_soc: fix parsing packets in GDM
When using DSA, set the special tag in GDM ingress control to allow the MAC
to parse packets properly earlier. This affects rx DMA source port reporting.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -19,6 +19,7 @@
#include <linux/interrupt.h>
#include <linux/pinctrl/devinfo.h>
#include <linux/phylink.h>
+#include <net/dsa.h>
#include "mtk_eth_soc.h"
@@ -1285,13 +1286,12 @@ static int mtk_poll_rx(struct napi_struc
break;
/* find out which mac the packet come from. values start at 1 */
- if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
+ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628) ||
+ (trxd.rxd4 & RX_DMA_SPECIAL_TAG))
mac = 0;
- } else {
- mac = (trxd.rxd4 >> RX_DMA_FPORT_SHIFT) &
- RX_DMA_FPORT_MASK;
- mac--;
- }
+ else
+ mac = ((trxd.rxd4 >> RX_DMA_FPORT_SHIFT) &
+ RX_DMA_FPORT_MASK) - 1;
if (unlikely(mac < 0 || mac >= MTK_MAC_COUNT ||
!eth->netdev[mac]))
@@ -2254,6 +2254,9 @@ static void mtk_gdm_config(struct mtk_et
val |= config;
+ if (!i && eth->netdev[0] && netdev_uses_dsa(eth->netdev[0]))
+ val |= MTK_GDMA_SPECIAL_TAG;
+
mtk_w32(eth, val, MTK_GDMA_FWD_CFG(i));
}
/* Reset and enable PSE */
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -81,6 +81,7 @@
/* GDM Exgress Control Register */
#define MTK_GDMA_FWD_CFG(x) (0x500 + (x * 0x1000))
+#define MTK_GDMA_SPECIAL_TAG BIT(24)
#define MTK_GDMA_ICS_EN BIT(22)
#define MTK_GDMA_TCS_EN BIT(21)
#define MTK_GDMA_UCS_EN BIT(20)
@@ -318,6 +319,7 @@
#define RX_DMA_L4_VALID_PDMA BIT(30) /* when PDMA is used */
#define RX_DMA_FPORT_SHIFT 19
#define RX_DMA_FPORT_MASK 0x7
+#define RX_DMA_SPECIAL_TAG BIT(22)
/* PHY Indirect Access Control registers */
#define MTK_PHY_IAC 0x10004
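In effect, the rxd4 parsing now behaves as in the following standalone sketch (constants copied from the header above): with DSA special tags enabled, the forward-port field is not meaningful, so all traffic is attributed to MAC 0; otherwise the 1-based FPORT field is converted to a 0-based MAC index.

#include <stdint.h>

#define RX_DMA_FPORT_SHIFT 19
#define RX_DMA_FPORT_MASK  0x7
#define RX_DMA_SPECIAL_TAG (1u << 22)

static int rxd4_to_mac(uint32_t rxd4, int is_mt7628)
{
        if (is_mt7628 || (rxd4 & RX_DMA_SPECIAL_TAG))
                return 0;
        return (int)((rxd4 >> RX_DMA_FPORT_SHIFT) & RX_DMA_FPORT_MASK) - 1;
}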

View File

@ -1,568 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Wed, 24 Mar 2021 02:30:54 +0100
Subject: [PATCH] net: ethernet: mtk_eth_soc: add flow offloading support
This adds support for offloading IPv4 routed flows, including SNAT/DNAT,
one VLAN, PPPoE and DSA.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
create mode 100644 drivers/net/ethernet/mediatek/mtk_ppe_offload.c
--- a/drivers/net/ethernet/mediatek/Makefile
+++ b/drivers/net/ethernet/mediatek/Makefile
@@ -4,5 +4,5 @@
#
obj-$(CONFIG_NET_MEDIATEK_SOC) += mtk_eth.o
-mtk_eth-y := mtk_eth_soc.o mtk_sgmii.o mtk_eth_path.o mtk_ppe.o mtk_ppe_debugfs.o
+mtk_eth-y := mtk_eth_soc.o mtk_sgmii.o mtk_eth_path.o mtk_ppe.o mtk_ppe_debugfs.o mtk_ppe_offload.o
obj-$(CONFIG_NET_MEDIATEK_STAR_EMAC) += mtk_star_emac.o
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -2834,6 +2834,7 @@ static const struct net_device_ops mtk_n
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = mtk_poll_controller,
#endif
+ .ndo_setup_tc = mtk_eth_setup_tc,
};
static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np)
@@ -3092,6 +3093,10 @@ static int mtk_probe(struct platform_dev
eth->base + MTK_ETH_PPE_BASE, 2);
if (err)
goto err_free_dev;
+
+ err = mtk_eth_offload_init(eth);
+ if (err)
+ goto err_free_dev;
}
for (i = 0; i < MTK_MAX_DEVS; i++) {
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -15,6 +15,7 @@
#include <linux/u64_stats_sync.h>
#include <linux/refcount.h>
#include <linux/phylink.h>
+#include <linux/rhashtable.h>
#include "mtk_ppe.h"
#define MTK_QDMA_PAGE_SIZE 2048
@@ -40,7 +41,8 @@
NETIF_F_HW_VLAN_CTAG_RX | \
NETIF_F_SG | NETIF_F_TSO | \
NETIF_F_TSO6 | \
- NETIF_F_IPV6_CSUM)
+ NETIF_F_IPV6_CSUM |\
+ NETIF_F_HW_TC)
#define MTK_HW_FEATURES_MT7628 (NETIF_F_SG | NETIF_F_RXCSUM)
#define NEXT_DESP_IDX(X, Y) (((X) + 1) & ((Y) - 1))
@@ -929,6 +931,7 @@ struct mtk_eth {
int ip_align;
struct mtk_ppe ppe;
+ struct rhashtable flow_table;
};
/* struct mtk_mac - the structure that holds the info about the MACs of the
@@ -973,4 +976,9 @@ int mtk_gmac_sgmii_path_setup(struct mtk
int mtk_gmac_gephy_path_setup(struct mtk_eth *eth, int mac_id);
int mtk_gmac_rgmii_path_setup(struct mtk_eth *eth, int mac_id);
+int mtk_eth_offload_init(struct mtk_eth *eth);
+int mtk_eth_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ void *type_data);
+
+
#endif /* MTK_ETH_H */
--- /dev/null
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Felix Fietkau <nbd@nbd.name>
+ */
+
+#include <linux/if_ether.h>
+#include <linux/rhashtable.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <net/flow_offload.h>
+#include <net/pkt_cls.h>
+#include <net/dsa.h>
+#include "mtk_eth_soc.h"
+
+struct mtk_flow_data {
+ struct ethhdr eth;
+
+ union {
+ struct {
+ __be32 src_addr;
+ __be32 dst_addr;
+ } v4;
+ };
+
+ __be16 src_port;
+ __be16 dst_port;
+
+ struct {
+ u16 id;
+ __be16 proto;
+ u8 num;
+ } vlan;
+ struct {
+ u16 sid;
+ u8 num;
+ } pppoe;
+};
+
+struct mtk_flow_entry {
+ struct rhash_head node;
+ unsigned long cookie;
+ u16 hash;
+};
+
+static const struct rhashtable_params mtk_flow_ht_params = {
+ .head_offset = offsetof(struct mtk_flow_entry, node),
+ .head_offset = offsetof(struct mtk_flow_entry, cookie),
+ .key_len = sizeof(unsigned long),
+ .automatic_shrinking = true,
+};
+
+static u32
+mtk_eth_timestamp(struct mtk_eth *eth)
+{
+ return mtk_r32(eth, 0x0010) & MTK_FOE_IB1_BIND_TIMESTAMP;
+}
+
+static int
+mtk_flow_set_ipv4_addr(struct mtk_foe_entry *foe, struct mtk_flow_data *data,
+ bool egress)
+{
+ return mtk_foe_entry_set_ipv4_tuple(foe, egress,
+ data->v4.src_addr, data->src_port,
+ data->v4.dst_addr, data->dst_port);
+}
+
+static void
+mtk_flow_offload_mangle_eth(const struct flow_action_entry *act, void *eth)
+{
+ void *dest = eth + act->mangle.offset;
+ const void *src = &act->mangle.val;
+
+ if (act->mangle.offset > 8)
+ return;
+
+ if (act->mangle.mask == 0xffff) {
+ src += 2;
+ dest += 2;
+ }
+
+ memcpy(dest, src, act->mangle.mask ? 2 : 4);
+}
+
+
+static int
+mtk_flow_mangle_ports(const struct flow_action_entry *act,
+ struct mtk_flow_data *data)
+{
+ u32 val = ntohl(act->mangle.val);
+
+ switch (act->mangle.offset) {
+ case 0:
+ if (act->mangle.mask == ~htonl(0xffff))
+ data->dst_port = cpu_to_be16(val);
+ else
+ data->src_port = cpu_to_be16(val >> 16);
+ break;
+ case 2:
+ data->dst_port = cpu_to_be16(val);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+mtk_flow_mangle_ipv4(const struct flow_action_entry *act,
+ struct mtk_flow_data *data)
+{
+ __be32 *dest;
+
+ switch (act->mangle.offset) {
+ case offsetof(struct iphdr, saddr):
+ dest = &data->v4.src_addr;
+ break;
+ case offsetof(struct iphdr, daddr):
+ dest = &data->v4.dst_addr;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ memcpy(dest, &act->mangle.val, sizeof(u32));
+
+ return 0;
+}
+
+static int
+mtk_flow_get_dsa_port(struct net_device **dev)
+{
+#if IS_ENABLED(CONFIG_NET_DSA)
+ struct dsa_port *dp;
+
+ dp = dsa_port_from_netdev(*dev);
+ if (IS_ERR(dp))
+ return -ENODEV;
+
+ if (dp->cpu_dp->tag_ops->proto != DSA_TAG_PROTO_MTK)
+ return -ENODEV;
+
+ *dev = dp->cpu_dp->master;
+
+ return dp->index;
+#else
+ return -ENODEV;
+#endif
+}
+
+static int
+mtk_flow_set_output_device(struct mtk_eth *eth, struct mtk_foe_entry *foe,
+ struct net_device *dev)
+{
+ int pse_port, dsa_port;
+
+ dsa_port = mtk_flow_get_dsa_port(&dev);
+ if (dsa_port >= 0)
+ mtk_foe_entry_set_dsa(foe, dsa_port);
+
+ if (dev == eth->netdev[0])
+ pse_port = 1;
+ else if (dev == eth->netdev[1])
+ pse_port = 2;
+ else
+ return -EOPNOTSUPP;
+
+ mtk_foe_entry_set_pse_port(foe, pse_port);
+
+ return 0;
+}
+
+static int
+mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f)
+{
+ struct flow_rule *rule = flow_cls_offload_flow_rule(f);
+ struct flow_action_entry *act;
+ struct mtk_flow_data data = {};
+ struct mtk_foe_entry foe;
+ struct net_device *odev = NULL;
+ struct mtk_flow_entry *entry;
+ int offload_type = 0;
+ u16 addr_type = 0;
+ u32 timestamp;
+ u8 l4proto = 0;
+ int err = 0;
+ int hash;
+ int i;
+
+ if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_META)) {
+ struct flow_match_meta match;
+
+ flow_rule_match_meta(rule, &match);
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_CONTROL)) {
+ struct flow_match_control match;
+
+ flow_rule_match_control(rule, &match);
+ addr_type = match.key->addr_type;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) {
+ struct flow_match_basic match;
+
+ flow_rule_match_basic(rule, &match);
+ l4proto = match.key->ip_proto;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ flow_action_for_each(i, act, &rule->action) {
+ switch (act->id) {
+ case FLOW_ACTION_MANGLE:
+ if (act->mangle.htype == FLOW_ACT_MANGLE_HDR_TYPE_ETH)
+ mtk_flow_offload_mangle_eth(act, &data.eth);
+ break;
+ case FLOW_ACTION_REDIRECT:
+ odev = act->dev;
+ break;
+ case FLOW_ACTION_CSUM:
+ break;
+ case FLOW_ACTION_VLAN_PUSH:
+ if (data.vlan.num == 1 ||
+ act->vlan.proto != htons(ETH_P_8021Q))
+ return -EOPNOTSUPP;
+
+ data.vlan.id = act->vlan.vid;
+ data.vlan.proto = act->vlan.proto;
+ data.vlan.num++;
+ break;
+ case FLOW_ACTION_PPPOE_PUSH:
+ if (data.pppoe.num == 1)
+ return -EOPNOTSUPP;
+
+ data.pppoe.sid = act->pppoe.sid;
+ data.pppoe.num++;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ }
+
+ switch (addr_type) {
+ case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
+ offload_type = MTK_PPE_PKT_TYPE_IPV4_HNAPT;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (!is_valid_ether_addr(data.eth.h_source) ||
+ !is_valid_ether_addr(data.eth.h_dest))
+ return -EINVAL;
+
+ err = mtk_foe_entry_prepare(&foe, offload_type, l4proto, 0,
+ data.eth.h_source,
+ data.eth.h_dest);
+ if (err)
+ return err;
+
+ if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_PORTS)) {
+ struct flow_match_ports ports;
+
+ flow_rule_match_ports(rule, &ports);
+ data.src_port = ports.key->src;
+ data.dst_port = ports.key->dst;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ struct flow_match_ipv4_addrs addrs;
+
+ flow_rule_match_ipv4_addrs(rule, &addrs);
+
+ data.v4.src_addr = addrs.key->src;
+ data.v4.dst_addr = addrs.key->dst;
+
+ mtk_flow_set_ipv4_addr(&foe, &data, false);
+ }
+
+ flow_action_for_each(i, act, &rule->action) {
+ if (act->id != FLOW_ACTION_MANGLE)
+ continue;
+
+ switch (act->mangle.htype) {
+ case FLOW_ACT_MANGLE_HDR_TYPE_TCP:
+ case FLOW_ACT_MANGLE_HDR_TYPE_UDP:
+ err = mtk_flow_mangle_ports(act, &data);
+ break;
+ case FLOW_ACT_MANGLE_HDR_TYPE_IP4:
+ err = mtk_flow_mangle_ipv4(act, &data);
+ break;
+ case FLOW_ACT_MANGLE_HDR_TYPE_ETH:
+ /* handled earlier */
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (err)
+ return err;
+ }
+
+ if (addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ err = mtk_flow_set_ipv4_addr(&foe, &data, true);
+ if (err)
+ return err;
+ }
+
+ if (data.vlan.num == 1) {
+ if (data.vlan.proto != htons(ETH_P_8021Q))
+ return -EOPNOTSUPP;
+
+ mtk_foe_entry_set_vlan(&foe, data.vlan.id);
+ }
+ if (data.pppoe.num == 1)
+ mtk_foe_entry_set_pppoe(&foe, data.pppoe.sid);
+
+ err = mtk_flow_set_output_device(eth, &foe, odev);
+ if (err)
+ return err;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->cookie = f->cookie;
+ timestamp = mtk_eth_timestamp(eth);
+ hash = mtk_foe_entry_commit(&eth->ppe, &foe, timestamp);
+ if (hash < 0) {
+ err = hash;
+ goto free;
+ }
+
+ entry->hash = hash;
+ err = rhashtable_insert_fast(&eth->flow_table, &entry->node,
+ mtk_flow_ht_params);
+ if (err < 0)
+ goto clear_flow;
+
+ return 0;
+clear_flow:
+ mtk_foe_entry_clear(&eth->ppe, hash);
+free:
+ kfree(entry);
+ return err;
+}
+
+static int
+mtk_flow_offload_destroy(struct mtk_eth *eth, struct flow_cls_offload *f)
+{
+ struct mtk_flow_entry *entry;
+
+ entry = rhashtable_lookup(&eth->flow_table, &f->cookie,
+ mtk_flow_ht_params);
+ if (!entry)
+ return -ENOENT;
+
+ mtk_foe_entry_clear(&eth->ppe, entry->hash);
+ rhashtable_remove_fast(&eth->flow_table, &entry->node,
+ mtk_flow_ht_params);
+ kfree(entry);
+
+ return 0;
+}
+
+static int
+mtk_flow_offload_stats(struct mtk_eth *eth, struct flow_cls_offload *f)
+{
+ struct mtk_flow_entry *entry;
+ int timestamp;
+ u32 idle;
+
+ entry = rhashtable_lookup(&eth->flow_table, &f->cookie,
+ mtk_flow_ht_params);
+ if (!entry)
+ return -ENOENT;
+
+ timestamp = mtk_foe_entry_timestamp(&eth->ppe, entry->hash);
+ if (timestamp < 0)
+ return -ETIMEDOUT;
+
+ idle = mtk_eth_timestamp(eth) - timestamp;
+ f->stats.lastused = jiffies - idle * HZ;
+
+ return 0;
+}
+
+static int
+mtk_eth_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
+{
+ struct flow_cls_offload *cls = type_data;
+ struct net_device *dev = cb_priv;
+ struct mtk_mac *mac = netdev_priv(dev);
+ struct mtk_eth *eth = mac->hw;
+
+ if (!tc_can_offload(dev))
+ return -EOPNOTSUPP;
+
+ if (type != TC_SETUP_CLSFLOWER)
+ return -EOPNOTSUPP;
+
+ switch (cls->command) {
+ case FLOW_CLS_REPLACE:
+ return mtk_flow_offload_replace(eth, cls);
+ case FLOW_CLS_DESTROY:
+ return mtk_flow_offload_destroy(eth, cls);
+ case FLOW_CLS_STATS:
+ return mtk_flow_offload_stats(eth, cls);
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static int
+mtk_eth_setup_tc_block(struct net_device *dev, struct flow_block_offload *f)
+{
+ struct mtk_mac *mac = netdev_priv(dev);
+ struct mtk_eth *eth = mac->hw;
+ static LIST_HEAD(block_cb_list);
+ struct flow_block_cb *block_cb;
+ flow_setup_cb_t *cb;
+
+ if (!eth->ppe.foe_table)
+ return -EOPNOTSUPP;
+
+ if (f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+ return -EOPNOTSUPP;
+
+ cb = mtk_eth_setup_tc_block_cb;
+ f->driver_block_list = &block_cb_list;
+
+ switch (f->command) {
+ case FLOW_BLOCK_BIND:
+ block_cb = flow_block_cb_lookup(f->block, cb, dev);
+ if (block_cb) {
+ flow_block_cb_incref(block_cb);
+ return 0;
+ }
+ block_cb = flow_block_cb_alloc(cb, dev, dev, NULL);
+ if (IS_ERR(block_cb))
+ return PTR_ERR(block_cb);
+
+ flow_block_cb_add(block_cb, f);
+ list_add_tail(&block_cb->driver_list, &block_cb_list);
+ return 0;
+ case FLOW_BLOCK_UNBIND:
+ block_cb = flow_block_cb_lookup(f->block, cb, dev);
+ if (!block_cb)
+ return -ENOENT;
+
+ if (flow_block_cb_decref(block_cb)) {
+ flow_block_cb_remove(block_cb, f);
+ list_del(&block_cb->driver_list);
+ }
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+int mtk_eth_setup_tc(struct net_device *dev, enum tc_setup_type type,
+ void *type_data)
+{
+ if (type == TC_SETUP_FT)
+ return mtk_eth_setup_tc_block(dev, type_data);
+
+ return -EOPNOTSUPP;
+}
+
+int mtk_eth_offload_init(struct mtk_eth *eth)
+{
+ if (!eth->ppe.foe_table)
+ return 0;
+
+ return rhashtable_init(&eth->flow_table, &mtk_flow_ht_params);
+}

View File

@ -1,236 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 24 Mar 2021 02:30:55 +0100
Subject: [PATCH] docs: nf_flowtable: update documentation with
enhancements
This patch updates the flowtable documentation to describe recent
enhancements:
- Offload action is available after the first packets go through the
classic forwarding path.
- IPv4 and IPv6 are supported. Only the TCP and UDP layer 4 protocols are
  supported at this stage.
- Tuple has been augmented to track VLAN id and PPPoE session id.
- Bridge and IP forwarding integration, including bridge VLAN filtering
support.
- Hardware offload support.
- Describe the [OFFLOAD] and [HW_OFFLOAD] tags in the conntrack table
listing.
- Replace 'flow offload' by 'flow add' in example rulesets (preferred
syntax).
- Describe existing cache limitations.
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/Documentation/networking/nf_flowtable.rst
+++ b/Documentation/networking/nf_flowtable.rst
@@ -4,35 +4,38 @@
Netfilter's flowtable infrastructure
====================================
-This documentation describes the software flowtable infrastructure available in
-Netfilter since Linux kernel 4.16.
+This documentation describes the Netfilter flowtable infrastructure which allows
+you to define a fastpath through the flowtable datapath. This infrastructure
+also provides hardware offload support. The flowtable supports the layer 3
+protocols IPv4 and IPv6 and the layer 4 protocols TCP and UDP.
Overview
--------
-Initial packets follow the classic forwarding path, once the flow enters the
-established state according to the conntrack semantics (ie. we have seen traffic
-in both directions), then you can decide to offload the flow to the flowtable
-from the forward chain via the 'flow offload' action available in nftables.
-
-Packets that find an entry in the flowtable (ie. flowtable hit) are sent to the
-output netdevice via neigh_xmit(), hence, they bypass the classic forwarding
-path (the visible effect is that you do not see these packets from any of the
-netfilter hooks coming after the ingress). In case of flowtable miss, the packet
-follows the classic forward path.
-
-The flowtable uses a resizable hashtable, lookups are based on the following
-7-tuple selectors: source, destination, layer 3 and layer 4 protocols, source
-and destination ports and the input interface (useful in case there are several
-conntrack zones in place).
-
-Flowtables are populated via the 'flow offload' nftables action, so the user can
-selectively specify what flows are placed into the flow table. Hence, packets
-follow the classic forwarding path unless the user explicitly instruct packets
-to use this new alternative forwarding path via nftables policy.
+Once the first packet of the flow successfully goes through the IP forwarding
+path, from the second packet on, you might decide to offload the flow to the
+flowtable through your ruleset. The flowtable infrastructure provides a rule
+action that allows you to specify when to add a flow to the flowtable.
+
+A packet that finds a matching entry in the flowtable (ie. flowtable hit) is
+transmitted to the output netdevice via neigh_xmit(), hence, packets bypass the
+classic IP forwarding path (the visible effect is that you do not see these
+packets from any of the Netfilter hooks coming after ingress). In case that
+there is no matching entry in the flowtable (ie. flowtable miss), the packet
+follows the classic IP forwarding path.
+
+The flowtable uses a resizable hashtable. Lookups are based on the following
+n-tuple selectors: layer 2 protocol encapsulation (VLAN and PPPoE), layer 3
+source and destination, layer 4 source and destination ports and the input
+interface (useful in case there are several conntrack zones in place).
+
+The 'flow add' action allows you to populate the flowtable, the user selectively
+specifies what flows are placed into the flowtable. Hence, packets follow the
+classic IP forwarding path unless the user explicitly instructs flows to use this
+new alternative forwarding path via policy.
-This is represented in Fig.1, which describes the classic forwarding path
-including the Netfilter hooks and the flowtable fastpath bypass.
+The flowtable datapath is represented in Fig.1, which describes the classic IP
+forwarding path including the Netfilter hooks and the flowtable fastpath bypass.
::
@@ -67,11 +70,13 @@ including the Netfilter hooks and the fl
Fig.1 Netfilter hooks and flowtable interactions
The flowtable entry also stores the NAT configuration, so all packets are
-mangled according to the NAT policy that matches the initial packets that went
-through the classic forwarding path. The TTL is decremented before calling
-neigh_xmit(). Fragmented traffic is passed up to follow the classic forwarding
-path given that the transport selectors are missing, therefore flowtable lookup
-is not possible.
+mangled according to the NAT policy that is specified from the classic IP
+forwarding path. The TTL is decremented before calling neigh_xmit(). Fragmented
+traffic is passed up to follow the classic IP forwarding path given that the
+transport header is missing, in this case, flowtable lookups are not possible.
+TCP RST and FIN packets are also passed up to the classic IP forwarding path to
+release the flow gracefully. Packets that exceed the MTU are also passed up to
+the classic forwarding path to report packet-too-big ICMP errors to the sender.
Example configuration
---------------------
@@ -85,7 +90,7 @@ flowtable and add one rule to your forwa
}
chain y {
type filter hook forward priority 0; policy accept;
- ip protocol tcp flow offload @f
+ ip protocol tcp flow add @f
counter packets 0 bytes 0
}
}
@@ -103,6 +108,117 @@ flow is offloaded, you will observe that
does not get updated for the packets that are being forwarded through the
forwarding bypass.
+You can identify offloaded flows through the [OFFLOAD] tag when listing your
+connection tracking table.
+
+::
+ # conntrack -L
+ tcp 6 src=10.141.10.2 dst=192.168.10.2 sport=52728 dport=5201 src=192.168.10.2 dst=192.168.10.1 sport=5201 dport=52728 [OFFLOAD] mark=0 use=2
+
+
+Layer 2 encapsulation
+---------------------
+
+Since Linux kernel 5.13, the flowtable infrastructure discovers the real
+netdevice behind VLAN and PPPoE netdevices. The flowtable software datapath
+parses the VLAN and PPPoE layer 2 headers to extract the ethertype and the
+VLAN ID / PPPoE session ID which are used for the flowtable lookups. The
+flowtable datapath also deals with layer 2 decapsulation.
+
+You do not need to add the PPPoE and the VLAN devices to your flowtable,
+instead the real device is sufficient for the flowtable to track your flows.
+
+Bridge and IP forwarding
+------------------------
+
+Since Linux kernel 5.13, you can add bridge ports to the flowtable. The
+flowtable infrastructure discovers the topology behind the bridge device. This
+allows the flowtable to define a fastpath bypass between the bridge ports
+(represented as eth1 and eth2 in the example figure below) and the gateway
+device (represented as eth0) in your switch/router.
+
+::
+ fastpath bypass
+ .-------------------------.
+ / \
+ | IP forwarding |
+ | / \ \/
+ | br0 eth0 ..... eth0
+ . / \ *host B*
+ -> eth1 eth2
+ . *switch/router*
+ .
+ .
+ eth0
+ *host A*
+
+The flowtable infrastructure also supports bridge VLAN filtering actions
+such as PVID and untagged. You can also stack a classic VLAN device on top of
+your bridge port.
+
+If you would like your flowtable to define a fastpath between your bridge
+ports and your IP forwarding path, you have to add your bridge ports (as
+represented by the real netdevice) to your flowtable definition.
+
+Counters
+--------
+
+The flowtable can synchronize packet and byte counters with the existing
+connection tracking entry by specifying the counter statement in your flowtable
+definition, e.g.
+
+::
+ table inet x {
+ flowtable f {
+ hook ingress priority 0; devices = { eth0, eth1 };
+ counter
+ }
+ ...
+ }
+
+Counter support is available since Linux kernel 5.7.
+
+Hardware offload
+----------------
+
+If your network device provides hardware offload support, you can turn it on by
+means of the 'offload' flag in your flowtable definition, e.g.
+
+::
+ table inet x {
+ flowtable f {
+ hook ingress priority 0; devices = { eth0, eth1 };
+ flags offload;
+ }
+ ...
+ }
+
+There is a workqueue that adds the flows to the hardware. Note that a few
+packets might still run over the flowtable software path until the workqueue has
+a chance to offload the flow to the network device.
+
+You can identify hardware offloaded flows through the [HW_OFFLOAD] tag when
+listing your connection tracking table. Please, note that the [OFFLOAD] tag
+refers to the software offload mode, so there is a distinction between [OFFLOAD]
+which refers to the software flowtable fastpath and [HW_OFFLOAD] which refers
+to the hardware offload datapath being used by the flow.
+
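+A hardware offloaded flow is displayed along these lines (addresses and
+ports are illustrative):
+
+::
+ # conntrack -L
+ tcp 6 src=10.141.10.2 dst=192.168.10.2 sport=52728 dport=5201 src=192.168.10.2 dst=192.168.10.1 sport=5201 dport=52728 [HW_OFFLOAD] mark=0 use=2
+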
+The flowtable hardware offload infrastructure also supports DSA
+(Distributed Switch Architecture).
+
+Limitations
+-----------
+
+The flowtable behaves like a cache. The flowtable entries might get stale if
+either the destination MAC address or the egress netdevice that is used for
+transmission changes.
+
+This might be a problem if:
+
+- You run the flowtable in software mode and you combine bridge and IP
+ forwarding in your setup.
+- Hardware offload is enabled.
+
More reading
------------

View File

@ -1,72 +0,0 @@
From c5d66587b8900201e1530b7c18d41e87bd5812f4 Mon Sep 17 00:00:00 2001
From: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Date: Thu, 15 Apr 2021 17:37:48 -0700
Subject: [PATCH] net: ethernet: mediatek: ppe: fix busy wait loop
The intention is for the loop to time out if the body does not succeed.
The current logic calls time_is_before_jiffies(timeout) which is false
until after the timeout, so the loop body never executes.
Fix by using readl_poll_timeout as a more standard and less error-prone
solution.
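To see why, consider a minimal sketch (not the driver code) of the original
loop:
	unsigned long timeout = jiffies + HZ;
	/* time_is_before_jiffies(t) is only true once t lies in the past,
	 * i.e. jiffies has already advanced beyond it. Right after setting
	 * timeout one second into the future the condition is false, so the
	 * loop exits immediately and the poll body never runs. */
	while (time_is_before_jiffies(timeout)) {
		/* poll the hardware -- never reached before the timeout */
	}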
Fixes: ba37b7caf1ed ("net: ethernet: mtk_eth_soc: add support for initializing the PPE")
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Cc: Felix Fietkau <nbd@nbd.name>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_ppe.c | 20 +++++++++-----------
drivers/net/ethernet/mediatek/mtk_ppe.h | 1 +
2 files changed, 10 insertions(+), 11 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_ppe.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe.c
@@ -2,9 +2,8 @@
/* Copyright (C) 2020 Felix Fietkau <nbd@nbd.name> */
#include <linux/kernel.h>
-#include <linux/jiffies.h>
-#include <linux/delay.h>
#include <linux/io.h>
+#include <linux/iopoll.h>
#include <linux/etherdevice.h>
#include <linux/platform_device.h>
#include "mtk_ppe.h"
@@ -44,18 +43,17 @@ static u32 ppe_clear(struct mtk_ppe *ppe
static int mtk_ppe_wait_busy(struct mtk_ppe *ppe)
{
- unsigned long timeout = jiffies + HZ;
-
- while (time_is_before_jiffies(timeout)) {
- if (!(ppe_r32(ppe, MTK_PPE_GLO_CFG) & MTK_PPE_GLO_CFG_BUSY))
- return 0;
+ int ret;
+ u32 val;
- usleep_range(10, 20);
- }
+ ret = readl_poll_timeout(ppe->base + MTK_PPE_GLO_CFG, val,
+ !(val & MTK_PPE_GLO_CFG_BUSY),
+ 20, MTK_PPE_WAIT_TIMEOUT_US);
- dev_err(ppe->dev, "PPE table busy");
+ if (ret)
+ dev_err(ppe->dev, "PPE table busy");
- return -ETIMEDOUT;
+ return ret;
}
static void mtk_ppe_cache_clear(struct mtk_ppe *ppe)
--- a/drivers/net/ethernet/mediatek/mtk_ppe.h
+++ b/drivers/net/ethernet/mediatek/mtk_ppe.h
@@ -12,6 +12,7 @@
#define MTK_PPE_ENTRIES_SHIFT 3
#define MTK_PPE_ENTRIES (1024 << MTK_PPE_ENTRIES_SHIFT)
#define MTK_PPE_HASH_MASK (MTK_PPE_ENTRIES - 1)
+#define MTK_PPE_WAIT_TIMEOUT_US 1000000
#define MTK_FOE_IB1_UNBIND_TIMESTAMP GENMASK(7, 0)
#define MTK_FOE_IB1_UNBIND_PACKETS GENMASK(23, 8)

View File

@ -1,29 +0,0 @@
From 6ecaf81d4ac6365f9284f9d68d74f7c209e74f98 Mon Sep 17 00:00:00 2001
From: DENG Qingfang <dqfext@gmail.com>
Date: Sat, 17 Apr 2021 15:29:04 +0800
Subject: [PATCH] net: ethernet: mediatek: fix a typo bug in flow offloading
The issue was traffic problems after a while, with increased ping times when
flow offload is active. It turns out that rhashtable_params needs key_offset
set to the cookie field, but the initializer assigned head_offset a second
time instead. Fix the assignment.
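For reference: a duplicate designated initializer compiles silently, the
second .head_offset value overrides the first, and .key_offset is left at
zero, so lookups hashed the wrong bytes of struct mtk_flow_entry. A sketch
of the intended layout:
	static const struct rhashtable_params mtk_flow_ht_params = {
		.head_offset = offsetof(struct mtk_flow_entry, node),   /* linkage */
		.key_offset  = offsetof(struct mtk_flow_entry, cookie), /* lookup key */
		.key_len     = sizeof(unsigned long),
		.automatic_shrinking = true,
	};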
Fixes: 502e84e2382d ("net: ethernet: mtk_eth_soc: add flow offloading support")
Signed-off-by: DENG Qingfang <dqfext@gmail.com>
Tested-by: Frank Wunderlich <frank-w@public-files.de>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_ppe_offload.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -44,7 +44,7 @@ struct mtk_flow_entry {
static const struct rhashtable_params mtk_flow_ht_params = {
.head_offset = offsetof(struct mtk_flow_entry, node),
- .head_offset = offsetof(struct mtk_flow_entry, cookie),
+ .key_offset = offsetof(struct mtk_flow_entry, cookie),
.key_len = sizeof(unsigned long),
.automatic_shrinking = true,
};

View File

@ -1,38 +0,0 @@
From 5196c417854942e218a59ec87bf7d414b3bd581e Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:20:55 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: unmap RX data before calling
build_skb
Since build_skb accesses the data area (for initializing shinfo), dma unmap
needs to happen before that call.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
[Ilya: split build_skb cleanup fix into a separate commit]
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1319,6 +1319,9 @@ static int mtk_poll_rx(struct napi_struc
goto release_desc;
}
+ dma_unmap_single(eth->dev, trxd.rxd1,
+ ring->buf_size, DMA_FROM_DEVICE);
+
/* receive data */
skb = build_skb(data, ring->frag_size);
if (unlikely(!skb)) {
@@ -1328,8 +1331,6 @@ static int mtk_poll_rx(struct napi_struc
}
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
- dma_unmap_single(eth->dev, trxd.rxd1,
- ring->buf_size, DMA_FROM_DEVICE);
pktlen = RX_DMA_GET_PLEN0(trxd.rxd2);
skb->dev = netdev;
skb_put(skb, pktlen);

View File

@ -1,38 +0,0 @@
From 787082ab9f7be4711e52f67c388535eda74a1269 Mon Sep 17 00:00:00 2001
From: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Date: Thu, 22 Apr 2021 22:20:56 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: fix build_skb cleanup
In case build_skb fails, call skb_free_frag on the correct pointer. Also
update the DMA structures with the new mapping before exiting, because
the mapping was successful
Suggested-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1325,9 +1325,9 @@ static int mtk_poll_rx(struct napi_struc
/* receive data */
skb = build_skb(data, ring->frag_size);
if (unlikely(!skb)) {
- skb_free_frag(new_data);
+ skb_free_frag(data);
netdev->stats.rx_dropped++;
- goto release_desc;
+ goto skip_rx;
}
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
@@ -1347,6 +1347,7 @@ static int mtk_poll_rx(struct napi_struc
skb_record_rx_queue(skb, 0);
napi_gro_receive(napi, skb);
+skip_rx:
ring->data[idx] = new_data;
rxd->rxd1 = (unsigned int)dma_addr;

View File

@ -1,77 +0,0 @@
From c30c4a82739090a2de4a4e3f245355ea4fb3ec14 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:20:57 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: use napi_consume_skb
This should improve performance, since napi_consume_skb can free skbs in bulk.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 19 ++++++++++++-------
1 file changed, 12 insertions(+), 7 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -879,7 +879,8 @@ static int txd_to_idx(struct mtk_tx_ring
return ((void *)dma - (void *)ring->dma) / sizeof(*dma);
}
-static void mtk_tx_unmap(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf)
+static void mtk_tx_unmap(struct mtk_eth *eth, struct mtk_tx_buf *tx_buf,
+ bool napi)
{
if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) {
@@ -911,8 +912,12 @@ static void mtk_tx_unmap(struct mtk_eth
tx_buf->flags = 0;
if (tx_buf->skb &&
- (tx_buf->skb != (struct sk_buff *)MTK_DMA_DUMMY_DESC))
- dev_kfree_skb_any(tx_buf->skb);
+ (tx_buf->skb != (struct sk_buff *)MTK_DMA_DUMMY_DESC)) {
+ if (napi)
+ napi_consume_skb(tx_buf->skb, napi);
+ else
+ dev_kfree_skb_any(tx_buf->skb);
+ }
tx_buf->skb = NULL;
}
@@ -1090,7 +1095,7 @@ err_dma:
tx_buf = mtk_desc_to_tx_buf(ring, itxd);
/* unmap dma */
- mtk_tx_unmap(eth, tx_buf);
+ mtk_tx_unmap(eth, tx_buf, false);
itxd->txd3 = TX_DMA_LS0 | TX_DMA_OWNER_CPU;
if (!MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA))
@@ -1409,7 +1414,7 @@ static int mtk_poll_tx_qdma(struct mtk_e
done[mac]++;
budget--;
}
- mtk_tx_unmap(eth, tx_buf);
+ mtk_tx_unmap(eth, tx_buf, true);
ring->last_free = desc;
atomic_inc(&ring->free_count);
@@ -1446,7 +1451,7 @@ static int mtk_poll_tx_pdma(struct mtk_e
budget--;
}
- mtk_tx_unmap(eth, tx_buf);
+ mtk_tx_unmap(eth, tx_buf, true);
desc = &ring->dma[cpu];
ring->last_free = desc;
@@ -1648,7 +1653,7 @@ static void mtk_tx_clean(struct mtk_eth
if (ring->buf) {
for (i = 0; i < MTK_DMA_SIZE; i++)
- mtk_tx_unmap(eth, &ring->buf[i]);
+ mtk_tx_unmap(eth, &ring->buf[i], false);
kfree(ring->buf);
ring->buf = NULL;
}

View File

@ -1,30 +0,0 @@
From 3630d519d7c3eab92567658690e44ffe0517d109 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:20:58 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: reduce MDIO bus access latency
usleep_range often ends up sleeping much longer than the 10-20us provided
as a range here. This causes significant latency in MDIO bus accesses,
which easily adds multiple seconds to the boot time on MT7621 when polling
DSA slave ports.
Use cond_resched instead of usleep_range, since the MDIO access does not
take much time.
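The resulting pattern, as a minimal sketch (MTK_PHY_IAC and PHY_IAC_ACCESS
are assumed register/bit names; the timeout check is from the surrounding
hunk):
	while (mtk_r32(eth, MTK_PHY_IAC) & PHY_IAC_ACCESS) {
		if (time_after(jiffies, t_start + PHY_IAC_TIMEOUT))
			break;
		/* yield instead of sleeping: the access completes quickly,
		 * and usleep_range() may oversleep by whole timer ticks */
		cond_resched();
	}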
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -86,7 +86,7 @@ static int mtk_mdio_busy_wait(struct mtk
return 0;
if (time_after(jiffies, t_start + PHY_IAC_TIMEOUT))
break;
- usleep_range(10, 20);
+ cond_resched();
}
dev_err(eth->dev, "mdio: MDIO timeout\n");

View File

@ -1,54 +0,0 @@
From 16ef670789b252b221700adc413497ed2f941d8a Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:20:59 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: remove unnecessary TX queue stops
When running short on descriptors, only stop the queue for the netdev that
tx was attempted for. By the time something tries to send on the other
netdev, the ring might have some more room already.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 15 ++-------------
1 file changed, 2 insertions(+), 13 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1152,17 +1152,6 @@ static void mtk_wake_queue(struct mtk_et
}
}
-static void mtk_stop_queue(struct mtk_eth *eth)
-{
- int i;
-
- for (i = 0; i < MTK_MAC_COUNT; i++) {
- if (!eth->netdev[i])
- continue;
- netif_stop_queue(eth->netdev[i]);
- }
-}
-
static netdev_tx_t mtk_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct mtk_mac *mac = netdev_priv(dev);
@@ -1183,7 +1172,7 @@ static netdev_tx_t mtk_start_xmit(struct
tx_num = mtk_cal_txd_req(skb);
if (unlikely(atomic_read(&ring->free_count) <= tx_num)) {
- mtk_stop_queue(eth);
+ netif_stop_queue(dev);
netif_err(eth, tx_queued, dev,
"Tx Ring full when queue awake!\n");
spin_unlock(&eth->page_lock);
@@ -1209,7 +1198,7 @@ static netdev_tx_t mtk_start_xmit(struct
goto drop;
if (unlikely(atomic_read(&ring->free_count) <= ring->thresh))
- mtk_stop_queue(eth);
+ netif_stop_queue(dev);
spin_unlock(&eth->page_lock);

View File

@ -1,37 +0,0 @@
From 59555a8d0dd39bf60b7ca1ba5e7393d293f7398d Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:21:00 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: use larger burst size for QDMA TX
Improves tx performance
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 +-
drivers/net/ethernet/mediatek/mtk_eth_soc.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -2214,7 +2214,7 @@ static int mtk_start_dma(struct mtk_eth
if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
mtk_w32(eth,
MTK_TX_WB_DDONE | MTK_TX_DMA_EN |
- MTK_DMA_SIZE_16DWORDS | MTK_NDP_CO_PRO |
+ MTK_TX_BT_32DWORDS | MTK_NDP_CO_PRO |
MTK_RX_DMA_EN | MTK_RX_2B_OFFSET |
MTK_RX_BT_32DWORDS,
MTK_QDMA_GLO_CFG);
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -202,7 +202,7 @@
#define MTK_RX_BT_32DWORDS (3 << 11)
#define MTK_NDP_CO_PRO BIT(10)
#define MTK_TX_WB_DDONE BIT(6)
-#define MTK_DMA_SIZE_16DWORDS (2 << 4)
+#define MTK_TX_BT_32DWORDS (3 << 4)
#define MTK_RX_DMA_BUSY BIT(3)
#define MTK_TX_DMA_BUSY BIT(1)
#define MTK_RX_DMA_EN BIT(2)

View File

@ -1,26 +0,0 @@
From 6b4423b258b91032c50a5efca15d3d9bb194ea1d Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:21:01 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: increase DMA ring sizes
256 descriptors is not enough for multi-gigabit traffic under load on
MT7622. Bump it to 512 to improve performance.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -21,7 +21,7 @@
#define MTK_QDMA_PAGE_SIZE 2048
#define MTK_MAX_RX_LENGTH 1536
#define MTK_TX_DMA_BUF_LEN 0x3fff
-#define MTK_DMA_SIZE 256
+#define MTK_DMA_SIZE 512
#define MTK_NAPI_WEIGHT 64
#define MTK_MAC_COUNT 2
#define MTK_RX_ETH_HLEN (VLAN_ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)

View File

@ -1,313 +0,0 @@
From e9229ffd550b2d8c4997c67a501dbc3919fd4e26 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:21:02 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: implement dynamic interrupt
moderation
Reduces the number of interrupts under load
Signed-off-by: Felix Fietkau <nbd@nbd.name>
[Ilya: add documentation for new struct fields]
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/Kconfig | 1 +
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 96 +++++++++++++++++++--
drivers/net/ethernet/mediatek/mtk_eth_soc.h | 41 +++++++--
3 files changed, 124 insertions(+), 14 deletions(-)
--- a/drivers/net/ethernet/mediatek/Kconfig
+++ b/drivers/net/ethernet/mediatek/Kconfig
@@ -10,6 +10,7 @@ if NET_VENDOR_MEDIATEK
config NET_MEDIATEK_SOC
tristate "MediaTek SoC Gigabit Ethernet support"
select PHYLINK
+ select DIMLIB
help
This driver supports the gigabit ethernet MACs in the
MediaTek SoC family.
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1254,12 +1254,13 @@ static void mtk_update_rx_cpu_idx(struct
static int mtk_poll_rx(struct napi_struct *napi, int budget,
struct mtk_eth *eth)
{
+ struct dim_sample dim_sample = {};
struct mtk_rx_ring *ring;
int idx;
struct sk_buff *skb;
u8 *data, *new_data;
struct mtk_rx_dma *rxd, trxd;
- int done = 0;
+ int done = 0, bytes = 0;
while (done < budget) {
struct net_device *netdev;
@@ -1333,6 +1334,7 @@ static int mtk_poll_rx(struct napi_struc
else
skb_checksum_none_assert(skb);
skb->protocol = eth_type_trans(skb, netdev);
+ bytes += pktlen;
if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX &&
(trxd.rxd2 & RX_DMA_VTAG))
@@ -1365,6 +1367,12 @@ rx_done:
mtk_update_rx_cpu_idx(eth);
}
+ eth->rx_packets += done;
+ eth->rx_bytes += bytes;
+ dim_update_sample(eth->rx_events, eth->rx_packets, eth->rx_bytes,
+ &dim_sample);
+ net_dim(&eth->rx_dim, dim_sample);
+
return done;
}
@@ -1457,6 +1465,7 @@ static int mtk_poll_tx_pdma(struct mtk_e
static int mtk_poll_tx(struct mtk_eth *eth, int budget)
{
struct mtk_tx_ring *ring = &eth->tx_ring;
+ struct dim_sample dim_sample = {};
unsigned int done[MTK_MAX_DEVS];
unsigned int bytes[MTK_MAX_DEVS];
int total = 0, i;
@@ -1474,8 +1483,14 @@ static int mtk_poll_tx(struct mtk_eth *e
continue;
netdev_completed_queue(eth->netdev[i], done[i], bytes[i]);
total += done[i];
+ eth->tx_packets += done[i];
+ eth->tx_bytes += bytes[i];
}
+ dim_update_sample(eth->tx_events, eth->tx_packets, eth->tx_bytes,
+ &dim_sample);
+ net_dim(&eth->tx_dim, dim_sample);
+
if (mtk_queue_stopped(eth) &&
(atomic_read(&ring->free_count) > ring->thresh))
mtk_wake_queue(eth);
@@ -2150,6 +2165,7 @@ static irqreturn_t mtk_handle_irq_rx(int
{
struct mtk_eth *eth = _eth;
+ eth->rx_events++;
if (likely(napi_schedule_prep(&eth->rx_napi))) {
__napi_schedule(&eth->rx_napi);
mtk_rx_irq_disable(eth, MTK_RX_DONE_INT);
@@ -2162,6 +2178,7 @@ static irqreturn_t mtk_handle_irq_tx(int
{
struct mtk_eth *eth = _eth;
+ eth->tx_events++;
if (likely(napi_schedule_prep(&eth->tx_napi))) {
__napi_schedule(&eth->tx_napi);
mtk_tx_irq_disable(eth, MTK_TX_DONE_INT);
@@ -2346,6 +2363,9 @@ static int mtk_stop(struct net_device *d
napi_disable(&eth->tx_napi);
napi_disable(&eth->rx_napi);
+ cancel_work_sync(&eth->rx_dim.work);
+ cancel_work_sync(&eth->tx_dim.work);
+
if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA))
mtk_stop_dma(eth, MTK_QDMA_GLO_CFG);
mtk_stop_dma(eth, MTK_PDMA_GLO_CFG);
@@ -2398,6 +2418,64 @@ err_disable_clks:
return ret;
}
+static void mtk_dim_rx(struct work_struct *work)
+{
+ struct dim *dim = container_of(work, struct dim, work);
+ struct mtk_eth *eth = container_of(dim, struct mtk_eth, rx_dim);
+ struct dim_cq_moder cur_profile;
+ u32 val, cur;
+
+ cur_profile = net_dim_get_rx_moderation(eth->rx_dim.mode,
+ dim->profile_ix);
+ spin_lock_bh(&eth->dim_lock);
+
+ val = mtk_r32(eth, MTK_PDMA_DELAY_INT);
+ val &= MTK_PDMA_DELAY_TX_MASK;
+ val |= MTK_PDMA_DELAY_RX_EN;
+
+ cur = min_t(u32, DIV_ROUND_UP(cur_profile.usec, 20), MTK_PDMA_DELAY_PTIME_MASK);
+ val |= cur << MTK_PDMA_DELAY_RX_PTIME_SHIFT;
+
+ cur = min_t(u32, cur_profile.pkts, MTK_PDMA_DELAY_PINT_MASK);
+ val |= cur << MTK_PDMA_DELAY_RX_PINT_SHIFT;
+
+ mtk_w32(eth, val, MTK_PDMA_DELAY_INT);
+ mtk_w32(eth, val, MTK_QDMA_DELAY_INT);
+
+ spin_unlock_bh(&eth->dim_lock);
+
+ dim->state = DIM_START_MEASURE;
+}
+
+static void mtk_dim_tx(struct work_struct *work)
+{
+ struct dim *dim = container_of(work, struct dim, work);
+ struct mtk_eth *eth = container_of(dim, struct mtk_eth, tx_dim);
+ struct dim_cq_moder cur_profile;
+ u32 val, cur;
+
+ cur_profile = net_dim_get_tx_moderation(eth->tx_dim.mode,
+ dim->profile_ix);
+ spin_lock_bh(&eth->dim_lock);
+
+ val = mtk_r32(eth, MTK_PDMA_DELAY_INT);
+ val &= MTK_PDMA_DELAY_RX_MASK;
+ val |= MTK_PDMA_DELAY_TX_EN;
+
+ cur = min_t(u32, DIV_ROUND_UP(cur_profile.usec, 20), MTK_PDMA_DELAY_PTIME_MASK);
+ val |= cur << MTK_PDMA_DELAY_TX_PTIME_SHIFT;
+
+ cur = min_t(u32, cur_profile.pkts, MTK_PDMA_DELAY_PINT_MASK);
+ val |= cur << MTK_PDMA_DELAY_TX_PINT_SHIFT;
+
+ mtk_w32(eth, val, MTK_PDMA_DELAY_INT);
+ mtk_w32(eth, val, MTK_QDMA_DELAY_INT);
+
+ spin_unlock_bh(&eth->dim_lock);
+
+ dim->state = DIM_START_MEASURE;
+}
+
static int mtk_hw_init(struct mtk_eth *eth)
{
int i, val, ret;
@@ -2419,9 +2497,6 @@ static int mtk_hw_init(struct mtk_eth *e
goto err_disable_pm;
}
- /* enable interrupt delay for RX */
- mtk_w32(eth, MTK_PDMA_DELAY_RX_DELAY, MTK_PDMA_DELAY_INT);
-
/* disable delay and normal interrupt */
mtk_tx_irq_disable(eth, ~0);
mtk_rx_irq_disable(eth, ~0);
@@ -2460,11 +2535,11 @@ static int mtk_hw_init(struct mtk_eth *e
/* Enable RX VLan Offloading */
mtk_w32(eth, 1, MTK_CDMP_EG_CTRL);
- /* enable interrupt delay for RX */
- mtk_w32(eth, MTK_PDMA_DELAY_RX_DELAY, MTK_PDMA_DELAY_INT);
+ /* set interrupt delays based on current Net DIM sample */
+ mtk_dim_rx(&eth->rx_dim.work);
+ mtk_dim_tx(&eth->tx_dim.work);
/* disable delay and normal interrupt */
- mtk_w32(eth, 0, MTK_QDMA_DELAY_INT);
mtk_tx_irq_disable(eth, ~0);
mtk_rx_irq_disable(eth, ~0);
@@ -2969,6 +3044,13 @@ static int mtk_probe(struct platform_dev
spin_lock_init(&eth->page_lock);
spin_lock_init(&eth->tx_irq_lock);
spin_lock_init(&eth->rx_irq_lock);
+ spin_lock_init(&eth->dim_lock);
+
+ eth->rx_dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+ INIT_WORK(&eth->rx_dim.work, mtk_dim_rx);
+
+ eth->tx_dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
+ INIT_WORK(&eth->tx_dim.work, mtk_dim_tx);
if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
eth->ethsys = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -16,6 +16,7 @@
#include <linux/refcount.h>
#include <linux/phylink.h>
#include <linux/rhashtable.h>
+#include <linux/dim.h>
#include "mtk_ppe.h"
#define MTK_QDMA_PAGE_SIZE 2048
@@ -136,13 +137,18 @@
/* PDMA Delay Interrupt Register */
#define MTK_PDMA_DELAY_INT 0xa0c
+#define MTK_PDMA_DELAY_RX_MASK GENMASK(15, 0)
#define MTK_PDMA_DELAY_RX_EN BIT(15)
-#define MTK_PDMA_DELAY_RX_PINT 4
#define MTK_PDMA_DELAY_RX_PINT_SHIFT 8
-#define MTK_PDMA_DELAY_RX_PTIME 4
-#define MTK_PDMA_DELAY_RX_DELAY \
- (MTK_PDMA_DELAY_RX_EN | MTK_PDMA_DELAY_RX_PTIME | \
- (MTK_PDMA_DELAY_RX_PINT << MTK_PDMA_DELAY_RX_PINT_SHIFT))
+#define MTK_PDMA_DELAY_RX_PTIME_SHIFT 0
+
+#define MTK_PDMA_DELAY_TX_MASK GENMASK(31, 16)
+#define MTK_PDMA_DELAY_TX_EN BIT(31)
+#define MTK_PDMA_DELAY_TX_PINT_SHIFT 24
+#define MTK_PDMA_DELAY_TX_PTIME_SHIFT 16
+
+#define MTK_PDMA_DELAY_PINT_MASK 0x7f
+#define MTK_PDMA_DELAY_PTIME_MASK 0xff
/* PDMA Interrupt Status Register */
#define MTK_PDMA_INT_STATUS 0xa20
@@ -224,6 +230,7 @@
/* QDMA Interrupt Status Register */
#define MTK_QDMA_INT_STATUS 0x1A18
#define MTK_RX_DONE_DLY BIT(30)
+#define MTK_TX_DONE_DLY BIT(28)
#define MTK_RX_DONE_INT3 BIT(19)
#define MTK_RX_DONE_INT2 BIT(18)
#define MTK_RX_DONE_INT1 BIT(17)
@@ -233,8 +240,7 @@
#define MTK_TX_DONE_INT1 BIT(1)
#define MTK_TX_DONE_INT0 BIT(0)
#define MTK_RX_DONE_INT MTK_RX_DONE_DLY
-#define MTK_TX_DONE_INT (MTK_TX_DONE_INT0 | MTK_TX_DONE_INT1 | \
- MTK_TX_DONE_INT2 | MTK_TX_DONE_INT3)
+#define MTK_TX_DONE_INT MTK_TX_DONE_DLY
/* QDMA Interrupt grouping registers */
#define MTK_QDMA_INT_GRP1 0x1a20
@@ -863,6 +869,7 @@ struct mtk_sgmii {
* @page_lock: Make sure that register operations are atomic
* @tx_irq__lock: Make sure that IRQ register operations are atomic
* @rx_irq__lock: Make sure that IRQ register operations are atomic
+ * @dim_lock: Make sure that Net DIM operations are atomic
* @dummy_dev: we run 2 netdevs on 1 physical DMA ring and need a
* dummy for NAPI to work
* @netdev: The netdev instances
@@ -881,6 +888,14 @@ struct mtk_sgmii {
* @rx_ring_qdma: Pointer to the memory holding info about the QDMA RX ring
* @tx_napi: The TX NAPI struct
* @rx_napi: The RX NAPI struct
+ * @rx_events: Net DIM RX event counter
+ * @rx_packets: Net DIM RX packet counter
+ * @rx_bytes: Net DIM RX byte counter
+ * @rx_dim: Net DIM RX context
+ * @tx_events: Net DIM TX event counter
+ * @tx_packets: Net DIM TX packet counter
+ * @tx_bytes: Net DIM TX byte counter
+ * @tx_dim: Net DIM TX context
* @scratch_ring: Newer SoCs need memory for a second HW managed TX ring
* @phy_scratch_ring: physical address of scratch_ring
* @scratch_head: The scratch memory that scratch_ring points to.
@@ -925,6 +940,18 @@ struct mtk_eth {
const struct mtk_soc_data *soc;
+ spinlock_t dim_lock;
+
+ u32 rx_events;
+ u32 rx_packets;
+ u32 rx_bytes;
+ struct dim rx_dim;
+
+ u32 tx_events;
+ u32 tx_packets;
+ u32 tx_bytes;
+ struct dim tx_dim;
+
u32 tx_int_mask_reg;
u32 tx_int_status_reg;
u32 rx_dma_l4_valid;

View File

@ -1,73 +0,0 @@
From 4e6bf609569c59b6bd6acf4a607c096cbd820d79 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:21:03 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: cache HW pointer of last freed TX
descriptor
The value is only updated by the CPU, so it is cheaper to access from the
ring data structure than from a hardware register.
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 8 ++++----
drivers/net/ethernet/mediatek/mtk_eth_soc.h | 2 ++
2 files changed, 6 insertions(+), 4 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1385,7 +1385,7 @@ static int mtk_poll_tx_qdma(struct mtk_e
struct mtk_tx_buf *tx_buf;
u32 cpu, dma;
- cpu = mtk_r32(eth, MTK_QTX_CRX_PTR);
+ cpu = ring->last_free_ptr;
dma = mtk_r32(eth, MTK_QTX_DRX_PTR);
desc = mtk_qdma_phys_to_virt(ring, cpu);
@@ -1419,6 +1419,7 @@ static int mtk_poll_tx_qdma(struct mtk_e
cpu = next_cpu;
}
+ ring->last_free_ptr = cpu;
mtk_w32(eth, cpu, MTK_QTX_CRX_PTR);
return budget;
@@ -1619,6 +1620,7 @@ static int mtk_tx_alloc(struct mtk_eth *
atomic_set(&ring->free_count, MTK_DMA_SIZE - 2);
ring->next_free = &ring->dma[0];
ring->last_free = &ring->dma[MTK_DMA_SIZE - 1];
+ ring->last_free_ptr = (u32)(ring->phys + ((MTK_DMA_SIZE - 1) * sz));
ring->thresh = MAX_SKB_FRAGS;
/* make sure that all changes to the dma ring are flushed before we
@@ -1632,9 +1634,7 @@ static int mtk_tx_alloc(struct mtk_eth *
mtk_w32(eth,
ring->phys + ((MTK_DMA_SIZE - 1) * sz),
MTK_QTX_CRX_PTR);
- mtk_w32(eth,
- ring->phys + ((MTK_DMA_SIZE - 1) * sz),
- MTK_QTX_DRX_PTR);
+ mtk_w32(eth, ring->last_free_ptr, MTK_QTX_DRX_PTR);
mtk_w32(eth, (QDMA_RES_THRES << 8) | QDMA_RES_THRES,
MTK_QTX_CFG(0));
} else {
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -656,6 +656,7 @@ struct mtk_tx_buf {
* @phys: The physical addr of tx_buf
* @next_free: Pointer to the next free descriptor
* @last_free: Pointer to the last free descriptor
+ * @last_free_ptr: Hardware pointer value of the last free descriptor
* @thresh: The threshold of minimum amount of free descriptors
* @free_count: QDMA uses a linked list. Track how many free descriptors
* are present
@@ -666,6 +667,7 @@ struct mtk_tx_ring {
dma_addr_t phys;
struct mtk_tx_dma *next_free;
struct mtk_tx_dma *last_free;
+ u32 last_free_ptr;
u16 thresh;
atomic_t free_count;
int dma_size;

View File

@ -1,49 +0,0 @@
From 816ac3e6e67bdd78d86226c6eb53619780750e92 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:21:04 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: only read the full RX descriptor
if DMA is done
Uncached memory access is expensive, and there is no need to access all
descriptor words if we can't process them anyway
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -798,13 +798,18 @@ static inline int mtk_max_buf_size(int f
return buf_size;
}
-static inline void mtk_rx_get_desc(struct mtk_rx_dma *rxd,
+static inline bool mtk_rx_get_desc(struct mtk_rx_dma *rxd,
struct mtk_rx_dma *dma_rxd)
{
- rxd->rxd1 = READ_ONCE(dma_rxd->rxd1);
rxd->rxd2 = READ_ONCE(dma_rxd->rxd2);
+ if (!(rxd->rxd2 & RX_DMA_DONE))
+ return false;
+
+ rxd->rxd1 = READ_ONCE(dma_rxd->rxd1);
rxd->rxd3 = READ_ONCE(dma_rxd->rxd3);
rxd->rxd4 = READ_ONCE(dma_rxd->rxd4);
+
+ return true;
}
/* the qdma core needs scratch memory to be setup */
@@ -1276,8 +1281,7 @@ static int mtk_poll_rx(struct napi_struc
rxd = &ring->dma[idx];
data = ring->data[idx];
- mtk_rx_get_desc(&trxd, rxd);
- if (!(trxd.rxd2 & RX_DMA_DONE))
+ if (!mtk_rx_get_desc(&trxd, rxd))
break;
/* find out which mac the packet come from. values start at 1 */

View File

@ -1,39 +0,0 @@
From 16769a8923fad5a5377253bcd76b0e0d64976c73 Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:21:05 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: reduce unnecessary interrupts
Avoid rearming interrupt if napi_complete returns false
Signed-off-by: Felix Fietkau <nbd@nbd.name>
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1540,8 +1540,8 @@ static int mtk_napi_tx(struct napi_struc
if (status & MTK_TX_DONE_INT)
return budget;
- napi_complete(napi);
- mtk_tx_irq_enable(eth, MTK_TX_DONE_INT);
+ if (napi_complete(napi))
+ mtk_tx_irq_enable(eth, MTK_TX_DONE_INT);
return tx_done;
}
@@ -1574,8 +1574,9 @@ poll_again:
remain_budget -= rx_done;
goto poll_again;
}
- napi_complete(napi);
- mtk_rx_irq_enable(eth, MTK_RX_DONE_INT);
+
+ if (napi_complete(napi))
+ mtk_rx_irq_enable(eth, MTK_RX_DONE_INT);
return rx_done + budget - remain_budget;
}

View File

@ -1,110 +0,0 @@
From db2c7b353db3b3f71b55f9ff4627d8a786446fbe Mon Sep 17 00:00:00 2001
From: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Date: Thu, 22 Apr 2021 22:21:06 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: rework NAPI callbacks
Use napi_complete_done to communicate total TX and RX work done to NAPI.
Count total RX work up instead of remaining work down for clarity.
Remove unneeded local variables for clarity. Use do {} while instead of
goto for clarity.
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 54 +++++++++------------
1 file changed, 24 insertions(+), 30 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1517,7 +1517,6 @@ static void mtk_handle_status_irq(struct
static int mtk_napi_tx(struct napi_struct *napi, int budget)
{
struct mtk_eth *eth = container_of(napi, struct mtk_eth, tx_napi);
- u32 status, mask;
int tx_done = 0;
if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA))
@@ -1526,21 +1525,19 @@ static int mtk_napi_tx(struct napi_struc
tx_done = mtk_poll_tx(eth, budget);
if (unlikely(netif_msg_intr(eth))) {
- status = mtk_r32(eth, eth->tx_int_status_reg);
- mask = mtk_r32(eth, eth->tx_int_mask_reg);
dev_info(eth->dev,
- "done tx %d, intr 0x%08x/0x%x\n",
- tx_done, status, mask);
+ "done tx %d, intr 0x%08x/0x%x\n", tx_done,
+ mtk_r32(eth, eth->tx_int_status_reg),
+ mtk_r32(eth, eth->tx_int_mask_reg));
}
if (tx_done == budget)
return budget;
- status = mtk_r32(eth, eth->tx_int_status_reg);
- if (status & MTK_TX_DONE_INT)
+ if (mtk_r32(eth, eth->tx_int_status_reg) & MTK_TX_DONE_INT)
return budget;
- if (napi_complete(napi))
+ if (napi_complete_done(napi, tx_done))
mtk_tx_irq_enable(eth, MTK_TX_DONE_INT);
return tx_done;
@@ -1549,36 +1546,33 @@ static int mtk_napi_tx(struct napi_struc
static int mtk_napi_rx(struct napi_struct *napi, int budget)
{
struct mtk_eth *eth = container_of(napi, struct mtk_eth, rx_napi);
- u32 status, mask;
- int rx_done = 0;
- int remain_budget = budget;
+ int rx_done_total = 0;
mtk_handle_status_irq(eth);
-poll_again:
- mtk_w32(eth, MTK_RX_DONE_INT, MTK_PDMA_INT_STATUS);
- rx_done = mtk_poll_rx(napi, remain_budget, eth);
+ do {
+ int rx_done;
- if (unlikely(netif_msg_intr(eth))) {
- status = mtk_r32(eth, MTK_PDMA_INT_STATUS);
- mask = mtk_r32(eth, MTK_PDMA_INT_MASK);
- dev_info(eth->dev,
- "done rx %d, intr 0x%08x/0x%x\n",
- rx_done, status, mask);
- }
- if (rx_done == remain_budget)
- return budget;
+ mtk_w32(eth, MTK_RX_DONE_INT, MTK_PDMA_INT_STATUS);
+ rx_done = mtk_poll_rx(napi, budget - rx_done_total, eth);
+ rx_done_total += rx_done;
+
+ if (unlikely(netif_msg_intr(eth))) {
+ dev_info(eth->dev,
+ "done rx %d, intr 0x%08x/0x%x\n", rx_done,
+ mtk_r32(eth, MTK_PDMA_INT_STATUS),
+ mtk_r32(eth, MTK_PDMA_INT_MASK));
+ }
- status = mtk_r32(eth, MTK_PDMA_INT_STATUS);
- if (status & MTK_RX_DONE_INT) {
- remain_budget -= rx_done;
- goto poll_again;
- }
+ if (rx_done_total == budget)
+ return budget;
+
+ } while (mtk_r32(eth, MTK_PDMA_INT_STATUS) & MTK_RX_DONE_INT);
- if (napi_complete(napi))
+ if (napi_complete_done(napi, rx_done_total))
mtk_rx_irq_enable(eth, MTK_RX_DONE_INT);
- return rx_done + budget - remain_budget;
+ return rx_done_total;
}
static int mtk_tx_alloc(struct mtk_eth *eth)

View File

@ -1,47 +0,0 @@
From fa817272c37ef78e25dc14e4760ac78a7043a18a Mon Sep 17 00:00:00 2001
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 22 Apr 2021 22:21:07 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: set PPE flow hash as skb hash if
present
This improves GRO performance
Signed-off-by: Felix Fietkau <nbd@nbd.name>
[Ilya: Use MTK_RXD4_FOE_ENTRY instead of GENMASK(13, 0)]
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 8 ++++++++
1 file changed, 8 insertions(+)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -19,6 +19,7 @@
#include <linux/interrupt.h>
#include <linux/pinctrl/devinfo.h>
#include <linux/phylink.h>
+#include <linux/jhash.h>
#include <net/dsa.h>
#include "mtk_eth_soc.h"
@@ -1271,6 +1272,7 @@ static int mtk_poll_rx(struct napi_struc
struct net_device *netdev;
unsigned int pktlen;
dma_addr_t dma_addr;
+ u32 hash;
int mac;
ring = mtk_get_rx_ring(eth);
@@ -1340,6 +1342,12 @@ static int mtk_poll_rx(struct napi_struc
skb->protocol = eth_type_trans(skb, netdev);
bytes += pktlen;
+ hash = trxd.rxd4 & MTK_RXD4_FOE_ENTRY;
+ if (hash != MTK_RXD4_FOE_ENTRY) {
+ hash = jhash_1word(hash, 0);
+ skb_set_hash(skb, hash, PKT_HASH_TYPE_L4);
+ }
+
if (netdev->features & NETIF_F_HW_VLAN_CTAG_RX &&
(trxd.rxd2 & RX_DMA_VTAG))
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),

View File

@ -1,71 +0,0 @@
From 3bc8e0aff23be0526af0dbc7973a8866a08d73f1 Mon Sep 17 00:00:00 2001
From: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Date: Thu, 22 Apr 2021 22:21:08 -0700
Subject: [PATCH] net: ethernet: mtk_eth_soc: use iopoll.h macro for DMA init
Replace a tight busy-wait loop without a pause with a standard
readx_poll_timeout_atomic routine with a 5 us poll period.
Tested by booting a MT7621 device to ensure the driver initializes
properly.
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 29 +++++++++------------
drivers/net/ethernet/mediatek/mtk_eth_soc.h | 2 +-
2 files changed, 14 insertions(+), 17 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -2054,25 +2054,22 @@ static int mtk_set_features(struct net_d
/* wait for DMA to finish whatever it is doing before we start using it again */
static int mtk_dma_busy_wait(struct mtk_eth *eth)
{
- unsigned long t_start = jiffies;
+ unsigned int reg;
+ int ret;
+ u32 val;
- while (1) {
- if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
- if (!(mtk_r32(eth, MTK_QDMA_GLO_CFG) &
- (MTK_RX_DMA_BUSY | MTK_TX_DMA_BUSY)))
- return 0;
- } else {
- if (!(mtk_r32(eth, MTK_PDMA_GLO_CFG) &
- (MTK_RX_DMA_BUSY | MTK_TX_DMA_BUSY)))
- return 0;
- }
+ if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA))
+ reg = MTK_QDMA_GLO_CFG;
+ else
+ reg = MTK_PDMA_GLO_CFG;
- if (time_after(jiffies, t_start + MTK_DMA_BUSY_TIMEOUT))
- break;
- }
+ ret = readx_poll_timeout_atomic(__raw_readl, eth->base + reg, val,
+ !(val & (MTK_RX_DMA_BUSY | MTK_TX_DMA_BUSY)),
+ 5, MTK_DMA_BUSY_TIMEOUT_US);
+ if (ret)
+ dev_err(eth->dev, "DMA init timeout\n");
- dev_err(eth->dev, "DMA init timeout\n");
- return -1;
+ return ret;
}
static int mtk_dma_init(struct mtk_eth *eth)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -213,7 +213,7 @@
#define MTK_TX_DMA_BUSY BIT(1)
#define MTK_RX_DMA_EN BIT(2)
#define MTK_TX_DMA_EN BIT(0)
-#define MTK_DMA_BUSY_TIMEOUT HZ
+#define MTK_DMA_BUSY_TIMEOUT_US 1000000
/* QDMA Reset Index Register */
#define MTK_QDMA_RST_IDX 0x1A08

View File

@ -1,63 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 18 Apr 2021 23:11:44 +0200
Subject: [PATCH] net: ethernet: mtk_eth_soc: missing mutex
Patch 2ed37183abb7 ("netfilter: flowtable: separate replace, destroy and
stats to different workqueues") splits the workqueue per event type. Add
a mutex to serialize updates.
Fixes: 502e84e2382d ("net: ethernet: mtk_eth_soc: add flow offloading support")
Reported-by: Frank Wunderlich <frank-w@public-files.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -392,6 +392,8 @@ mtk_flow_offload_stats(struct mtk_eth *e
return 0;
}
+static DEFINE_MUTEX(mtk_flow_offload_mutex);
+
static int
mtk_eth_setup_tc_block_cb(enum tc_setup_type type, void *type_data, void *cb_priv)
{
@@ -399,6 +401,7 @@ mtk_eth_setup_tc_block_cb(enum tc_setup_
struct net_device *dev = cb_priv;
struct mtk_mac *mac = netdev_priv(dev);
struct mtk_eth *eth = mac->hw;
+ int err;
if (!tc_can_offload(dev))
return -EOPNOTSUPP;
@@ -406,18 +409,24 @@ mtk_eth_setup_tc_block_cb(enum tc_setup_
if (type != TC_SETUP_CLSFLOWER)
return -EOPNOTSUPP;
+ mutex_lock(&mtk_flow_offload_mutex);
switch (cls->command) {
case FLOW_CLS_REPLACE:
- return mtk_flow_offload_replace(eth, cls);
+ err = mtk_flow_offload_replace(eth, cls);
+ break;
case FLOW_CLS_DESTROY:
- return mtk_flow_offload_destroy(eth, cls);
+ err = mtk_flow_offload_destroy(eth, cls);
+ break;
case FLOW_CLS_STATS:
- return mtk_flow_offload_stats(eth, cls);
+ err = mtk_flow_offload_stats(eth, cls);
+ break;
default:
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ break;
}
+ mutex_unlock(&mtk_flow_offload_mutex);
- return 0;
+ return err;
}
static int

View File

@ -1,22 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 18 Apr 2021 23:11:45 +0200
Subject: [PATCH] net: ethernet: mtk_eth_soc: handle VLAN pop action
Do not hit EOPNOTSUPP when flowtable offload provides a VLAN pop action.
Fixes: efce49dfe6a8 ("netfilter: flowtable: add vlan pop action offload support")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
--- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
+++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c
@@ -233,6 +233,8 @@ mtk_flow_offload_replace(struct mtk_eth
data.vlan.proto = act->vlan.proto;
data.vlan.num++;
break;
+ case FLOW_ACTION_VLAN_POP:
+ break;
case FLOW_ACTION_PPPOE_PUSH:
if (data.pppoe.num == 1)
return -EOPNOTSUPP;

View File

@ -1,159 +0,0 @@
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Sun, 28 Mar 2021 23:08:55 +0200
Subject: [PATCH] netfilter: flowtable: dst_check() from garbage collector path
Move dst_check() to the garbage collector path. Stale routes trigger the
flow entry teardown state which makes affected flows go back to the
classic forwarding path to re-evaluate flow offloading.
IPv6 requires the dst cookie to work, store it in the flow_tuple,
otherwise dst_check() always fails.
Fixes: e5075c0badaa ("netfilter: flowtable: call dst_check() to fall back to classic forwarding")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -129,7 +129,10 @@ struct flow_offload_tuple {
in_vlan_ingress:2;
u16 mtu;
union {
- struct dst_entry *dst_cache;
+ struct {
+ struct dst_entry *dst_cache;
+ u32 dst_cookie;
+ };
struct {
u32 ifidx;
u32 hw_ifidx;
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -74,6 +74,18 @@ err_ct_refcnt:
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);
+static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
+{
+ const struct rt6_info *rt;
+
+ if (flow_tuple->l3proto == NFPROTO_IPV6) {
+ rt = (const struct rt6_info *)flow_tuple->dst_cache;
+ return rt6_get_cookie(rt);
+ }
+
+ return 0;
+}
+
static int flow_offload_fill_route(struct flow_offload *flow,
const struct nf_flow_route *route,
enum flow_offload_tuple_dir dir)
@@ -116,6 +128,7 @@ static int flow_offload_fill_route(struc
return -1;
flow_tuple->dst_cache = dst;
+ flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
break;
}
flow_tuple->xmit_type = route->tuple[dir].xmit_type;
@@ -389,11 +402,33 @@ nf_flow_table_iterate(struct nf_flowtabl
return err;
}
+static bool flow_offload_stale_dst(struct flow_offload_tuple *tuple)
+{
+ struct dst_entry *dst;
+
+ if (tuple->xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
+ tuple->xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
+ dst = tuple->dst_cache;
+ if (!dst_check(dst, tuple->dst_cookie))
+ return true;
+ }
+
+ return false;
+}
+
+static bool nf_flow_has_stale_dst(struct flow_offload *flow)
+{
+ return flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple) ||
+ flow_offload_stale_dst(&flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple);
+}
+
static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
{
struct nf_flowtable *flow_table = data;
- if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct))
+ if (nf_flow_has_expired(flow) ||
+ nf_ct_is_dying(flow->ct) ||
+ nf_flow_has_stale_dst(flow))
set_bit(NF_FLOW_TEARDOWN, &flow->flags);
if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -364,15 +364,6 @@ nf_flow_offload_ip_hook(void *priv, stru
if (nf_flow_state_check(flow, iph->protocol, skb, thoff))
return NF_ACCEPT;
- if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
- tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
- rt = (struct rtable *)tuplehash->tuple.dst_cache;
- if (!dst_check(&rt->dst, 0)) {
- flow_offload_teardown(flow);
- return NF_ACCEPT;
- }
- }
-
if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
@@ -391,6 +382,7 @@ nf_flow_offload_ip_hook(void *priv, stru
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
+ rt = (struct rtable *)tuplehash->tuple.dst_cache;
memset(skb->cb, 0, sizeof(struct inet_skb_parm));
IPCB(skb)->iif = skb->dev->ifindex;
IPCB(skb)->flags = IPSKB_FORWARDED;
@@ -399,6 +391,7 @@ nf_flow_offload_ip_hook(void *priv, stru
switch (tuplehash->tuple.xmit_type) {
case FLOW_OFFLOAD_XMIT_NEIGH:
+ rt = (struct rtable *)tuplehash->tuple.dst_cache;
outdev = rt->dst.dev;
skb->dev = outdev;
nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
@@ -607,15 +600,6 @@ nf_flow_offload_ipv6_hook(void *priv, st
if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff))
return NF_ACCEPT;
- if (tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
- tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM) {
- rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
- if (!dst_check(&rt->dst, 0)) {
- flow_offload_teardown(flow);
- return NF_ACCEPT;
- }
- }
-
if (skb_try_make_writable(skb, thoff + hdrsize))
return NF_DROP;
@@ -633,6 +617,7 @@ nf_flow_offload_ipv6_hook(void *priv, st
nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len);
if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) {
+ rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
IP6CB(skb)->iif = skb->dev->ifindex;
IP6CB(skb)->flags = IP6SKB_FORWARDED;
@@ -641,6 +626,7 @@ nf_flow_offload_ipv6_hook(void *priv, st
switch (tuplehash->tuple.xmit_type) {
case FLOW_OFFLOAD_XMIT_NEIGH:
+ rt = (struct rt6_info *)tuplehash->tuple.dst_cache;
outdev = rt->dst.dev;
skb->dev = outdev;
nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);

View File

@ -1,94 +0,0 @@
From: Oz Shlomo <ozsh@nvidia.com>
Date: Thu, 3 Jun 2021 15:12:33 +0300
Subject: [PATCH] netfilter: conntrack: Introduce tcp offload timeout
configuration
TCP connections may be offloaded from nf conntrack to nf flow table.
Offloaded connections are aged after 30 seconds of inactivity.
Once aged, ownership is returned to conntrack with a hard coded pickup
time of 120 seconds, after which the connection may be deleted.
The current aging intervals may be too aggressive for some users.
Provide users with the ability to control the nf flow table offload
aging and pickup time intervals via sysctl parameter as a pre-step for
configuring the nf flow table GC timeout intervals.
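With the patch applied, the intervals become tunable at runtime, e.g.
(values illustrative, interpreted in seconds by proc_dointvec_jiffies):
	# sysctl -w net.netfilter.nf_flowtable_tcp_timeout=60
	# sysctl -w net.netfilter.nf_flowtable_tcp_pickup=120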
Signed-off-by: Oz Shlomo <ozsh@nvidia.com>
Reviewed-by: Paul Blakey <paulb@nvidia.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -27,6 +27,10 @@ struct nf_tcp_net {
int tcp_loose;
int tcp_be_liberal;
int tcp_max_retrans;
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ unsigned int offload_timeout;
+ unsigned int offload_pickup;
+#endif
};
enum udp_conntrack {
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1438,6 +1438,11 @@ void nf_conntrack_tcp_init_net(struct ne
tn->tcp_loose = nf_ct_tcp_loose;
tn->tcp_be_liberal = nf_ct_tcp_be_liberal;
tn->tcp_max_retrans = nf_ct_tcp_max_retrans;
+
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ tn->offload_timeout = 30 * HZ;
+ tn->offload_pickup = 120 * HZ;
+#endif
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp =
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -567,6 +567,10 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_CLOSE,
NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_RETRANS,
NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK,
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP,
+#endif
NF_SYSCTL_CT_PROTO_TCP_LOOSE,
NF_SYSCTL_CT_PROTO_TCP_LIBERAL,
NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
@@ -758,6 +762,20 @@ static struct ctl_table nf_ct_sysctl_tab
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD] = {
+ .procname = "nf_flowtable_tcp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP] = {
+ .procname = "nf_flowtable_tcp_pickup",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#endif
[NF_SYSCTL_CT_PROTO_TCP_LOOSE] = {
.procname = "nf_conntrack_tcp_loose",
.maxlen = sizeof(int),
@@ -967,6 +985,12 @@ static void nf_conntrack_standalone_init
XASSIGN(LIBERAL, &tn->tcp_be_liberal);
XASSIGN(MAX_RETRANS, &tn->tcp_max_retrans);
#undef XASSIGN
+
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout;
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP].data = &tn->offload_pickup;
+#endif
+
}
static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net,

View File

@ -1,92 +0,0 @@
From: Oz Shlomo <ozsh@nvidia.com>
Date: Thu, 3 Jun 2021 15:12:34 +0300
Subject: [PATCH] netfilter: conntrack: Introduce udp offload timeout
configuration
UDP connections may be offloaded from nf conntrack to nf flow table.
Offloaded connections are aged after 30 seconds of inactivity.
Once aged, ownership is returned to conntrack with a hard coded pickup
time of 30 seconds, after which the connection may be deleted.
The current aging intervals may be too aggressive for some users.
Provide users with the ability to control the nf flow table offload
aging and pickup time intervals via sysctl parameter as a pre-step for
configuring the nf flow table GC timeout intervals.
Signed-off-by: Oz Shlomo <ozsh@nvidia.com>
Reviewed-by: Paul Blakey <paulb@nvidia.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -41,6 +41,10 @@ enum udp_conntrack {
struct nf_udp_net {
unsigned int timeouts[UDP_CT_MAX];
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ unsigned int offload_timeout;
+ unsigned int offload_pickup;
+#endif
};
struct nf_icmp_net {
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -273,6 +273,11 @@ void nf_conntrack_udp_init_net(struct ne
for (i = 0; i < UDP_CT_MAX; i++)
un->timeouts[i] = udp_timeouts[i];
+
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ un->offload_timeout = 30 * HZ;
+ un->offload_pickup = 30 * HZ;
+#endif
}
const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp =
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -576,6 +576,10 @@ enum nf_ct_sysctl_index {
NF_SYSCTL_CT_PROTO_TCP_MAX_RETRANS,
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP,
NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM,
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD,
+ NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP,
+#endif
NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP,
NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6,
#ifdef CONFIG_NF_CT_PROTO_SCTP
@@ -810,6 +814,20 @@ static struct ctl_table nf_ct_sysctl_tab
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD] = {
+ .procname = "nf_flowtable_udp_timeout",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP] = {
+ .procname = "nf_flowtable_udp_pickup",
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+#endif
[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = {
.procname = "nf_conntrack_icmp_timeout",
.maxlen = sizeof(unsigned int),
@@ -1078,6 +1096,10 @@ static int nf_conntrack_standalone_init_
table[NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6].data = &nf_icmpv6_pernet(net)->timeout;
table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP].data = &un->timeouts[UDP_CT_UNREPLIED];
table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED];
+#if IS_ENABLED(CONFIG_NF_FLOW_TABLE)
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout;
+ table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP].data = &un->offload_pickup;
+#endif
nf_conntrack_standalone_init_tcp_sysctl(net, table);
nf_conntrack_standalone_init_sctp_sysctl(net, table);

View File

@ -1,134 +0,0 @@
From: Oz Shlomo <ozsh@nvidia.com>
Date: Thu, 3 Jun 2021 15:12:35 +0300
Subject: [PATCH] netfilter: flowtable: Set offload timeouts according to proto
values
Currently the aging period for tcp/udp connections is hard coded to
30 seconds. Aged tcp/udp connections configure a hard coded 120/30
seconds pickup timeout for conntrack.
This configuration may be too aggressive or permissive for some users.
Dynamically configure the nf flow table GC timeout intervals according
to the user defined values.
Signed-off-by: Oz Shlomo <ozsh@nvidia.com>
Reviewed-by: Paul Blakey <paulb@nvidia.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
--- a/include/net/netfilter/nf_flow_table.h
+++ b/include/net/netfilter/nf_flow_table.h
@@ -174,6 +174,8 @@ struct flow_offload {
#define NF_FLOW_TIMEOUT (30 * HZ)
#define nf_flowtable_time_stamp (u32)jiffies
+unsigned long flow_offload_get_timeout(struct flow_offload *flow);
+
static inline __s32 nf_flow_timeout_delta(unsigned int timeout)
{
return (__s32)(timeout - nf_flowtable_time_stamp);
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -175,12 +175,10 @@ static void flow_offload_fixup_tcp(struc
tcp->seen[1].td_maxwin = 0;
}
-#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ)
-#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ)
-
static void flow_offload_fixup_ct_timeout(struct nf_conn *ct)
{
const struct nf_conntrack_l4proto *l4proto;
+ struct net *net = nf_ct_net(ct);
int l4num = nf_ct_protonum(ct);
unsigned int timeout;
@@ -188,12 +186,17 @@ static void flow_offload_fixup_ct_timeou
if (!l4proto)
return;
- if (l4num == IPPROTO_TCP)
- timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
- else if (l4num == IPPROTO_UDP)
- timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
- else
+ if (l4num == IPPROTO_TCP) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+ timeout = tn->offload_pickup;
+ } else if (l4num == IPPROTO_UDP) {
+ struct nf_udp_net *tn = nf_udp_pernet(net);
+
+ timeout = tn->offload_pickup;
+ } else {
return;
+ }
if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
@@ -265,11 +268,35 @@ static const struct rhashtable_params nf
.automatic_shrinking = true,
};
+unsigned long flow_offload_get_timeout(struct flow_offload *flow)
+{
+ const struct nf_conntrack_l4proto *l4proto;
+ unsigned long timeout = NF_FLOW_TIMEOUT;
+ struct net *net = nf_ct_net(flow->ct);
+ int l4num = nf_ct_protonum(flow->ct);
+
+ l4proto = nf_ct_l4proto_find(l4num);
+ if (!l4proto)
+ return timeout;
+
+ if (l4num == IPPROTO_TCP) {
+ struct nf_tcp_net *tn = nf_tcp_pernet(net);
+
+ timeout = tn->offload_timeout;
+ } else if (l4num == IPPROTO_UDP) {
+ struct nf_udp_net *tn = nf_udp_pernet(net);
+
+ timeout = tn->offload_timeout;
+ }
+
+ return timeout;
+}
+
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
int err;
- flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
+ flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
err = rhashtable_insert_fast(&flow_table->rhashtable,
&flow->tuplehash[0].node,
@@ -301,7 +328,7 @@ EXPORT_SYMBOL_GPL(flow_offload_add);
void flow_offload_refresh(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
- flow->timeout = nf_flowtable_time_stamp + NF_FLOW_TIMEOUT;
+ flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
if (likely(!nf_flowtable_hw_offload(flow_table)))
return;
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -885,7 +885,7 @@ static void flow_offload_work_stats(stru
lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
offload->flow->timeout = max_t(u64, offload->flow->timeout,
- lastused + NF_FLOW_TIMEOUT);
+ lastused + flow_offload_get_timeout(offload->flow));
if (offload->flowtable->flags & NF_FLOWTABLE_COUNTER) {
if (stats[0].pkts)
@@ -989,7 +989,7 @@ void nf_flow_offload_stats(struct nf_flo
__s32 delta;
delta = nf_flow_timeout_delta(flow->timeout);
- if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10))
+ if ((delta >= (9 * flow_offload_get_timeout(flow)) / 10))
return;
offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS);

View File

@ -1,138 +0,0 @@
From 4fd59792097a6b2fb949d41264386a7ecade469e Mon Sep 17 00:00:00 2001
From: DENG Qingfang <dqfext@gmail.com>
Date: Mon, 25 Jan 2021 12:20:46 +0800
Subject: [PATCH] net: ethernet: mediatek: support setting MTU
MT762x HW, except for MT7628, supports frame length up to 2048
(maximum length on GDM), so allow setting MTU up to 2030.
Also set the default frame length to the hardware default 1518.
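With MTK_RX_ETH_HLEN now at 18 bytes (ETH_HLEN + ETH_FCS_LEN), the 2048-byte
GDM limit leaves room for an MTU of 2030, which can then be set as usual
(interface name illustrative):
	# ip link set dev eth0 mtu 2030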
Signed-off-by: DENG Qingfang <dqfext@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Link: https://lore.kernel.org/r/20210125042046.5599-1-dqfext@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
drivers/net/ethernet/mediatek/mtk_eth_soc.c | 43 ++++++++++++++++++---
drivers/net/ethernet/mediatek/mtk_eth_soc.h | 12 ++++--
2 files changed, 47 insertions(+), 8 deletions(-)
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -355,7 +355,7 @@ static void mtk_mac_config(struct phylin
/* Setup gmac */
mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id));
mcr_new = mcr_cur;
- mcr_new |= MAC_MCR_MAX_RX_1536 | MAC_MCR_IPG_CFG | MAC_MCR_FORCE_MODE |
+ mcr_new |= MAC_MCR_IPG_CFG | MAC_MCR_FORCE_MODE |
MAC_MCR_BACKOFF_EN | MAC_MCR_BACKPR_EN | MAC_MCR_FORCE_LINK;
/* Only update control register when needed! */
@@ -782,8 +782,8 @@ static void mtk_get_stats64(struct net_d
static inline int mtk_max_frag_size(int mtu)
{
/* make sure buf_size will be at least MTK_MAX_RX_LENGTH */
- if (mtu + MTK_RX_ETH_HLEN < MTK_MAX_RX_LENGTH)
- mtu = MTK_MAX_RX_LENGTH - MTK_RX_ETH_HLEN;
+ if (mtu + MTK_RX_ETH_HLEN < MTK_MAX_RX_LENGTH_2K)
+ mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN;
return SKB_DATA_ALIGN(MTK_RX_HLEN + mtu) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
@@ -794,7 +794,7 @@ static inline int mtk_max_buf_size(int f
int buf_size = frag_size - NET_SKB_PAD - NET_IP_ALIGN -
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- WARN_ON(buf_size < MTK_MAX_RX_LENGTH);
+ WARN_ON(buf_size < MTK_MAX_RX_LENGTH_2K);
return buf_size;
}
@@ -2606,6 +2606,35 @@ static void mtk_uninit(struct net_device
mtk_rx_irq_disable(eth, ~0);
}
+static int mtk_change_mtu(struct net_device *dev, int new_mtu)
+{
+ int length = new_mtu + MTK_RX_ETH_HLEN;
+ struct mtk_mac *mac = netdev_priv(dev);
+ struct mtk_eth *eth = mac->hw;
+ u32 mcr_cur, mcr_new;
+
+ if (!MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
+ mcr_cur = mtk_r32(mac->hw, MTK_MAC_MCR(mac->id));
+ mcr_new = mcr_cur & ~MAC_MCR_MAX_RX_MASK;
+
+ if (length <= 1518)
+ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1518);
+ else if (length <= 1536)
+ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1536);
+ else if (length <= 1552)
+ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_1552);
+ else
+ mcr_new |= MAC_MCR_MAX_RX(MAC_MCR_MAX_RX_2048);
+
+ if (mcr_new != mcr_cur)
+ mtk_w32(mac->hw, mcr_new, MTK_MAC_MCR(mac->id));
+ }
+
+ dev->mtu = new_mtu;
+
+ return 0;
+}
+
static int mtk_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
struct mtk_mac *mac = netdev_priv(dev);
@@ -2902,6 +2931,7 @@ static const struct net_device_ops mtk_n
.ndo_set_mac_address = mtk_set_mac_address,
.ndo_validate_addr = eth_validate_addr,
.ndo_do_ioctl = mtk_do_ioctl,
+ .ndo_change_mtu = mtk_change_mtu,
.ndo_tx_timeout = mtk_tx_timeout,
.ndo_get_stats64 = mtk_get_stats64,
.ndo_fix_features = mtk_fix_features,
@@ -3004,7 +3034,10 @@ static int mtk_add_mac(struct mtk_eth *e
eth->netdev[id]->irq = eth->irq[0];
eth->netdev[id]->dev.of_node = np;
- eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH - MTK_RX_ETH_HLEN;
+ if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628))
+ eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH - MTK_RX_ETH_HLEN;
+ else
+ eth->netdev[id]->max_mtu = MTK_MAX_RX_LENGTH_2K - MTK_RX_ETH_HLEN;
return 0;
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -20,12 +20,13 @@
#include "mtk_ppe.h"
#define MTK_QDMA_PAGE_SIZE 2048
-#define MTK_MAX_RX_LENGTH 1536
+#define MTK_MAX_RX_LENGTH 1536
+#define MTK_MAX_RX_LENGTH_2K 2048
#define MTK_TX_DMA_BUF_LEN 0x3fff
#define MTK_DMA_SIZE 512
#define MTK_NAPI_WEIGHT 64
#define MTK_MAC_COUNT 2
-#define MTK_RX_ETH_HLEN (VLAN_ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
+#define MTK_RX_ETH_HLEN (ETH_HLEN + ETH_FCS_LEN)
#define MTK_RX_HLEN (NET_SKB_PAD + MTK_RX_ETH_HLEN + NET_IP_ALIGN)
#define MTK_DMA_DUMMY_DESC 0xffffffff
#define MTK_DEFAULT_MSG_ENABLE (NETIF_MSG_DRV | \
@@ -352,7 +353,12 @@
/* Mac control registers */
#define MTK_MAC_MCR(x) (0x10100 + (x * 0x100))
-#define MAC_MCR_MAX_RX_1536 BIT(24)
+#define MAC_MCR_MAX_RX_MASK GENMASK(25, 24)
+#define MAC_MCR_MAX_RX(_x) (MAC_MCR_MAX_RX_MASK & ((_x) << 24))
+#define MAC_MCR_MAX_RX_1518 0x0
+#define MAC_MCR_MAX_RX_1536 0x1
+#define MAC_MCR_MAX_RX_1552 0x2
+#define MAC_MCR_MAX_RX_2048 0x3
#define MAC_MCR_IPG_CFG (BIT(18) | BIT(16))
#define MAC_MCR_FORCE_MODE BIT(15)
#define MAC_MCR_TX_EN BIT(14)
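
The replacement for the fixed MAC_MCR_MAX_RX_1536 bit is a two-bit field in bits 25:24 of the MAC control register, selecting one of four frame-length limits. A small user-space sketch of the encoding used by mtk_change_mtu() above (the starting register value is made up for the demo):

    #include <stdio.h>
    #include <stdint.h>

    #define MAX_RX_MASK (0x3u << 24)        /* GENMASK(25, 24) */
    #define MAX_RX(x)   (MAX_RX_MASK & ((uint32_t)(x) << 24))

    static uint32_t max_rx_code(int frame_len)
    {
        if (frame_len <= 1518)
            return MAX_RX(0x0);
        if (frame_len <= 1536)
            return MAX_RX(0x1);
        if (frame_len <= 1552)
            return MAX_RX(0x2);
        return MAX_RX(0x3);                 /* up to the 2048-byte GDM limit */
    }

    int main(void)
    {
        uint32_t mcr = 0x00012345;          /* made-up current MCR value */
        int mtu = 2030;
        int frame_len = mtu + 14 + 4;       /* MTU + ETH_HLEN + ETH_FCS_LEN */

        /* clear the old code, merge the new one, write back only on change */
        uint32_t mcr_new = (mcr & ~MAX_RX_MASK) | max_rx_code(frame_len);
        if (mcr_new != mcr)
            printf("MCR 0x%08x -> 0x%08x\n", mcr, mcr_new);
        return 0;
    }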

View File

@ -1,108 +0,0 @@
From c329e5afb42ff0a88285eb4d8a391a18793e4777 Mon Sep 17 00:00:00 2001
From: David Bauer <mail@david-bauer.net>
Date: Thu, 15 Apr 2021 03:26:50 +0200
Subject: [PATCH] net: phy: at803x: select correct page on config init
The Atheros AR8031 and AR8033 expose different registers for SGMII/Fiber
as well as the copper side of the PHY depending on the BT_BX_REG_SEL bit
in the chip configure register.
The driver assumes the copper side is selected on probe, but this might
not be the case depending on which page was last selected by the
bootloader. Notably, Ubiquiti UniFi bootloaders show this behavior.
Select the copper page when probing to circumvent this.
Signed-off-by: David Bauer <mail@david-bauer.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/phy/at803x.c | 50 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 49 insertions(+), 1 deletion(-)
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -139,6 +139,9 @@
#define ATH8035_PHY_ID 0x004dd072
#define AT8030_PHY_ID_MASK 0xffffffef
+#define AT803X_PAGE_FIBER 0
+#define AT803X_PAGE_COPPER 1
+
MODULE_DESCRIPTION("Qualcomm Atheros AR803x PHY driver");
MODULE_AUTHOR("Matus Ujhelyi");
MODULE_LICENSE("GPL");
@@ -190,6 +193,35 @@ static int at803x_debug_reg_mask(struct
return phy_write(phydev, AT803X_DEBUG_DATA, val);
}
+static int at803x_write_page(struct phy_device *phydev, int page)
+{
+ int mask;
+ int set;
+
+ if (page == AT803X_PAGE_COPPER) {
+ set = AT803X_BT_BX_REG_SEL;
+ mask = 0;
+ } else {
+ set = 0;
+ mask = AT803X_BT_BX_REG_SEL;
+ }
+
+ return __phy_modify(phydev, AT803X_REG_CHIP_CONFIG, mask, set);
+}
+
+static int at803x_read_page(struct phy_device *phydev)
+{
+ int ccr = __phy_read(phydev, AT803X_REG_CHIP_CONFIG);
+
+ if (ccr < 0)
+ return ccr;
+
+ if (ccr & AT803X_BT_BX_REG_SEL)
+ return AT803X_PAGE_COPPER;
+
+ return AT803X_PAGE_FIBER;
+}
+
static int at803x_enable_rx_delay(struct phy_device *phydev)
{
return at803x_debug_reg_mask(phydev, AT803X_DEBUG_REG_0, 0,
@@ -508,6 +540,7 @@ static int at803x_probe(struct phy_devic
{
struct device *dev = &phydev->mdio.dev;
struct at803x_priv *priv;
+ int ret;
priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
if (!priv)
@@ -515,7 +548,20 @@ static int at803x_probe(struct phy_devic
phydev->priv = priv;
- return at803x_parse_dt(phydev);
+ ret = at803x_parse_dt(phydev);
+ if (ret)
+ return ret;
+
+ /* Some bootloaders leave the fiber page selected.
+ * Switch to the copper page, as otherwise we read
+ * the PHY capabilities from the fiber side.
+ */
+ if (at803x_match_phy_id(phydev, ATH8031_PHY_ID)) {
+ ret = phy_select_page(phydev, AT803X_PAGE_COPPER);
+ ret = phy_restore_page(phydev, AT803X_PAGE_COPPER, ret);
+ }
+
+ return ret;
}
static void at803x_remove(struct phy_device *phydev)
@@ -1097,6 +1143,8 @@ static struct phy_driver at803x_driver[]
.get_wol = at803x_get_wol,
.suspend = at803x_suspend,
.resume = at803x_resume,
+ .read_page = at803x_read_page,
+ .write_page = at803x_write_page,
/* PHY_GBIT_FEATURES */
.read_status = at803x_read_status,
.aneg_done = at803x_aneg_done,
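
Both accessors reduce to one bit, AT803X_BT_BX_REG_SEL, in the chip configure register: set selects the copper page, clear selects the fiber page. A user-space model of that mapping (the bit position below is an assumption made for the demo, not taken from the driver header):

    #include <stdio.h>
    #include <stdint.h>

    #define BT_BX_REG_SEL (1u << 15)        /* assumed bit position, illustration only */
    enum { PAGE_FIBER = 0, PAGE_COPPER = 1 };

    static int read_page(uint16_t chip_cfg)
    {
        return (chip_cfg & BT_BX_REG_SEL) ? PAGE_COPPER : PAGE_FIBER;
    }

    static uint16_t write_page(uint16_t chip_cfg, int page)
    {
        if (page == PAGE_COPPER)
            return chip_cfg | BT_BX_REG_SEL;    /* set selects copper */
        return chip_cfg & ~BT_BX_REG_SEL;       /* clear selects fiber */
    }

    int main(void)
    {
        uint16_t cfg = 0;   /* bootloader left the fiber page selected */

        printf("before: page %d\n", read_page(cfg));
        cfg = write_page(cfg, PAGE_COPPER);
        printf("after:  page %d\n", read_page(cfg));
        return 0;
    }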

View File

@ -1,73 +0,0 @@
From 8f7e876273e294b732b42af2e5e6bba91d798954 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Tue, 20 Apr 2021 12:29:29 +0200
Subject: [PATCH] net: phy: at803x: fix probe error if copper page is selected
The commit c329e5afb42f ("net: phy: at803x: select correct page on
config init") selects the copper page during probe. This fails if the
copper page was already selected. In this case, the value of the copper
page (which is 1) is propagated through phy_restore_page() and is
finally returned from at803x_probe(). Fix it by using at803x_write_page()
directly.
Also, in case of an error the regulator is not disabled, which leads to a
WARN_ON() when the probe fails. This couldn't happen before, because
at803x_parse_dt() was the last call in at803x_probe(). It is hard to
see that at803x_parse_dt() actually enables the regulator. Thus move the
regulator_enable() to the probe function and undo it in case of an
error.
Fixes: c329e5afb42f ("net: phy: at803x: select correct page on config init")
Signed-off-by: Michael Walle <michael@walle.cc>
Reviewed-by: David Bauer <mail@david-bauer.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/phy/at803x.c | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -527,10 +527,6 @@ static int at803x_parse_dt(struct phy_de
phydev_err(phydev, "failed to get VDDIO regulator\n");
return PTR_ERR(priv->vddio);
}
-
- ret = regulator_enable(priv->vddio);
- if (ret < 0)
- return ret;
}
return 0;
@@ -552,15 +548,30 @@ static int at803x_probe(struct phy_devic
if (ret)
return ret;
+ if (priv->vddio) {
+ ret = regulator_enable(priv->vddio);
+ if (ret < 0)
+ return ret;
+ }
+
/* Some bootloaders leave the fiber page selected.
* Switch to the copper page, as otherwise we read
* the PHY capabilities from the fiber side.
*/
if (at803x_match_phy_id(phydev, ATH8031_PHY_ID)) {
- ret = phy_select_page(phydev, AT803X_PAGE_COPPER);
- ret = phy_restore_page(phydev, AT803X_PAGE_COPPER, ret);
+ phy_lock_mdio_bus(phydev);
+ ret = at803x_write_page(phydev, AT803X_PAGE_COPPER);
+ phy_unlock_mdio_bus(phydev);
+ if (ret)
+ goto err;
}
+ return 0;
+
+err:
+ if (priv->vddio)
+ regulator_disable(priv->vddio);
+
return ret;
}
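
The root cause is a return-value convention mismatch: phy_select_page() returns the previously selected page, and phy_restore_page() passes a non-negative status value through unchanged, so the copper page number (1) escaped as the probe result, where any non-zero value reads as failure. A stripped-down model of the two paths (helper semantics simplified for the sketch):

    #include <stdio.h>

    /* simplified semantics: select returns the PREVIOUS page (>= 0),
     * restore hands a non-negative status value back unchanged */
    static int page = 1;                    /* copper already selected */

    static int select_page(int new_page)
    {
        int oldpage = page;

        page = new_page;
        return oldpage;
    }

    static int restore_page(int ret)
    {
        return ret;                         /* >= 0: passed straight through */
    }

    int main(void)
    {
        /* buggy pattern from c329e5afb42f: old page number escapes */
        int ret = restore_page(select_page(1));
        printf("probe returns %d (non-zero reads as failure)\n", ret);

        /* fixed pattern: write the page directly, report 0 on success */
        page = 1;
        printf("probe returns %d\n", 0);
        return 0;
    }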

View File

@ -1,56 +0,0 @@
From b1ae3587d16a8c8fc9453e147c8708d6f006ffbb Mon Sep 17 00:00:00 2001
From: Bjarni Jonasson <bjarni.jonasson@microchip.com>
Date: Wed, 13 Jan 2021 12:56:25 +0100
Subject: [PATCH] net: phy: Add 100 base-x mode
Sparx-5 supports this mode, and it is missing from the PHY core.
Signed-off-by: Bjarni Jonasson <bjarni.jonasson@microchip.com>
Reviewed-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
Documentation/networking/phy.rst | 5 +++++
include/linux/phy.h | 4 ++++
2 files changed, 9 insertions(+)
--- a/Documentation/networking/phy.rst
+++ b/Documentation/networking/phy.rst
@@ -286,6 +286,11 @@ Some of the interface modes are describe
Note: due to legacy usage, some 10GBASE-R usage incorrectly makes
use of this definition.
+``PHY_INTERFACE_MODE_100BASEX``
+ This defines IEEE 802.3 Clause 24. The link operates at a fixed signaling
+ rate of 125 Mbps using a 4B/5B encoding scheme, resulting in an underlying
+ data rate of 100 Mbps.
+
Pause frames / flow control
===========================
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -104,6 +104,7 @@ extern const int phy_10gbit_features_arr
* @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax
* @PHY_INTERFACE_MODE_QSGMII: Quad SGMII
* @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII
+ * @PHY_INTERFACE_MODE_100BASEX: 100 BaseX
* @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX
* @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX
* @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI
@@ -135,6 +136,7 @@ typedef enum {
PHY_INTERFACE_MODE_MOCA,
PHY_INTERFACE_MODE_QSGMII,
PHY_INTERFACE_MODE_TRGMII,
+ PHY_INTERFACE_MODE_100BASEX,
PHY_INTERFACE_MODE_1000BASEX,
PHY_INTERFACE_MODE_2500BASEX,
PHY_INTERFACE_MODE_RXAUI,
@@ -217,6 +219,8 @@ static inline const char *phy_modes(phy_
return "usxgmii";
case PHY_INTERFACE_MODE_10GKR:
return "10gbase-kr";
+ case PHY_INTERFACE_MODE_100BASEX:
+ return "100base-x";
default:
return "unknown";
}
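
The 4B/5B arithmetic above is easy to verify: every 4 payload bits are transmitted as a 5-bit code group, so the usable data rate is 4/5 of the fixed 125 Mbps line rate. A one-line check in plain C:

    #include <stdio.h>

    int main(void)
    {
        const double line_rate = 125.0;     /* fixed 100BASE-X signaling rate, Mbps */

        /* 4B/5B: every 4 payload bits leave the wire as a 5-bit code group */
        printf("data rate = %.0f Mbps\n", line_rate * 4.0 / 5.0);   /* prints 100 */
        return 0;
    }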

View File

@ -1,40 +0,0 @@
From 6e12f35cef6b8a458d7ecf507ae330e0bffaad8c Mon Sep 17 00:00:00 2001
From: Bjarni Jonasson <bjarni.jonasson@microchip.com>
Date: Wed, 13 Jan 2021 12:56:26 +0100
Subject: [PATCH] sfp: add support for 100 base-x SFPs
Add support for 100Base-FX, 100Base-LX, 100Base-PX and 100Base-BX10 modules.
This is needed for the Sparx-5 switch.
Signed-off-by: Bjarni Jonasson <bjarni.jonasson@microchip.com>
Reviewed-by: Russell King <rmk+kernel@armlinux.org.uk>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
drivers/net/phy/sfp-bus.c | 9 +++++++++
1 file changed, 9 insertions(+)
--- a/drivers/net/phy/sfp-bus.c
+++ b/drivers/net/phy/sfp-bus.c
@@ -280,6 +280,12 @@ void sfp_parse_support(struct sfp_bus *b
br_min <= 1300 && br_max >= 1200)
phylink_set(modes, 1000baseX_Full);
+ /* 100Base-FX, 100Base-LX, 100Base-PX, 100Base-BX10 */
+ if (id->base.e100_base_fx || id->base.e100_base_lx)
+ phylink_set(modes, 100baseFX_Full);
+ if ((id->base.e_base_px || id->base.e_base_bx10) && br_nom == 100)
+ phylink_set(modes, 100baseFX_Full);
+
/* For active or passive cables, select the link modes
* based on the bit rates and the cable compliance bytes.
*/
@@ -399,6 +405,9 @@ phy_interface_t sfp_select_interface(str
if (phylink_test(link_modes, 1000baseX_Full))
return PHY_INTERFACE_MODE_1000BASEX;
+ if (phylink_test(link_modes, 100baseFX_Full))
+ return PHY_INTERFACE_MODE_100BASEX;
+
dev_warn(bus->sfp_dev, "Unable to ascertain link mode\n");
return PHY_INTERFACE_MODE_NA;
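
Condensed, the new logic is a two-step decision: derive the supported link modes from the module's EEPROM compliance bits, then pick the PHY interface mode from those modes. A user-space sketch of the 100Base-X part (the struct below is a simplification of struct sfp_eeprom_id, reduced to the fields the patch touches):

    #include <stdio.h>
    #include <stdbool.h>

    struct eeprom_id {                      /* tiny subset of struct sfp_eeprom_id */
        bool e100_base_fx, e100_base_lx;
        bool e_base_px, e_base_bx10;
        int br_nom;                         /* nominal bit rate, Mbps */
    };

    static bool supports_100basefx(const struct eeprom_id *id)
    {
        if (id->e100_base_fx || id->e100_base_lx)
            return true;
        /* the PX/BX10 bits are shared with the gigabit variants, so the
         * nominal bit rate is what identifies the 100 Mbps modules */
        return (id->e_base_px || id->e_base_bx10) && id->br_nom == 100;
    }

    int main(void)
    {
        struct eeprom_id id = { .e_base_bx10 = true, .br_nom = 100 };

        if (supports_100basefx(&id))
            printf("100baseFX_Full -> PHY_INTERFACE_MODE_100BASEX\n");
        return 0;
    }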

View File

@ -1,549 +0,0 @@
From 41d26bf4aba070dfd2ab48866cc27a48ee6228c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Beh=C3=BAn?= <kabel@kernel.org>
Date: Tue, 20 Apr 2021 09:53:59 +0200
Subject: [PATCH] net: phy: marvell: refactor HWMON OOP style
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Use a structure of Marvell PHY specific HWMON methods to reduce code
duplication. Store a pointer to this structure into the PHY driver's
driver_data member.
Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/phy/marvell.c | 369 +++++++++++++-------------------------
1 file changed, 125 insertions(+), 244 deletions(-)
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -2141,6 +2141,19 @@ static int marvell_vct7_cable_test_get_s
}
#ifdef CONFIG_HWMON
+struct marvell_hwmon_ops {
+ int (*get_temp)(struct phy_device *phydev, long *temp);
+ int (*get_temp_critical)(struct phy_device *phydev, long *temp);
+ int (*set_temp_critical)(struct phy_device *phydev, long temp);
+ int (*get_temp_alarm)(struct phy_device *phydev, long *alarm);
+};
+
+static const struct marvell_hwmon_ops *
+to_marvell_hwmon_ops(const struct phy_device *phydev)
+{
+ return phydev->drv->driver_data;
+}
+
static int m88e1121_get_temp(struct phy_device *phydev, long *temp)
{
int oldpage;
@@ -2184,75 +2197,6 @@ error:
return phy_restore_page(phydev, oldpage, ret);
}
-static int m88e1121_hwmon_read(struct device *dev,
- enum hwmon_sensor_types type,
- u32 attr, int channel, long *temp)
-{
- struct phy_device *phydev = dev_get_drvdata(dev);
- int err;
-
- switch (attr) {
- case hwmon_temp_input:
- err = m88e1121_get_temp(phydev, temp);
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- return err;
-}
-
-static umode_t m88e1121_hwmon_is_visible(const void *data,
- enum hwmon_sensor_types type,
- u32 attr, int channel)
-{
- if (type != hwmon_temp)
- return 0;
-
- switch (attr) {
- case hwmon_temp_input:
- return 0444;
- default:
- return 0;
- }
-}
-
-static u32 m88e1121_hwmon_chip_config[] = {
- HWMON_C_REGISTER_TZ,
- 0
-};
-
-static const struct hwmon_channel_info m88e1121_hwmon_chip = {
- .type = hwmon_chip,
- .config = m88e1121_hwmon_chip_config,
-};
-
-static u32 m88e1121_hwmon_temp_config[] = {
- HWMON_T_INPUT,
- 0
-};
-
-static const struct hwmon_channel_info m88e1121_hwmon_temp = {
- .type = hwmon_temp,
- .config = m88e1121_hwmon_temp_config,
-};
-
-static const struct hwmon_channel_info *m88e1121_hwmon_info[] = {
- &m88e1121_hwmon_chip,
- &m88e1121_hwmon_temp,
- NULL
-};
-
-static const struct hwmon_ops m88e1121_hwmon_hwmon_ops = {
- .is_visible = m88e1121_hwmon_is_visible,
- .read = m88e1121_hwmon_read,
-};
-
-static const struct hwmon_chip_info m88e1121_hwmon_chip_info = {
- .ops = &m88e1121_hwmon_hwmon_ops,
- .info = m88e1121_hwmon_info,
-};
-
static int m88e1510_get_temp(struct phy_device *phydev, long *temp)
{
int ret;
@@ -2315,92 +2259,6 @@ static int m88e1510_get_temp_alarm(struc
return 0;
}
-static int m88e1510_hwmon_read(struct device *dev,
- enum hwmon_sensor_types type,
- u32 attr, int channel, long *temp)
-{
- struct phy_device *phydev = dev_get_drvdata(dev);
- int err;
-
- switch (attr) {
- case hwmon_temp_input:
- err = m88e1510_get_temp(phydev, temp);
- break;
- case hwmon_temp_crit:
- err = m88e1510_get_temp_critical(phydev, temp);
- break;
- case hwmon_temp_max_alarm:
- err = m88e1510_get_temp_alarm(phydev, temp);
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- return err;
-}
-
-static int m88e1510_hwmon_write(struct device *dev,
- enum hwmon_sensor_types type,
- u32 attr, int channel, long temp)
-{
- struct phy_device *phydev = dev_get_drvdata(dev);
- int err;
-
- switch (attr) {
- case hwmon_temp_crit:
- err = m88e1510_set_temp_critical(phydev, temp);
- break;
- default:
- return -EOPNOTSUPP;
- }
- return err;
-}
-
-static umode_t m88e1510_hwmon_is_visible(const void *data,
- enum hwmon_sensor_types type,
- u32 attr, int channel)
-{
- if (type != hwmon_temp)
- return 0;
-
- switch (attr) {
- case hwmon_temp_input:
- case hwmon_temp_max_alarm:
- return 0444;
- case hwmon_temp_crit:
- return 0644;
- default:
- return 0;
- }
-}
-
-static u32 m88e1510_hwmon_temp_config[] = {
- HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_MAX_ALARM,
- 0
-};
-
-static const struct hwmon_channel_info m88e1510_hwmon_temp = {
- .type = hwmon_temp,
- .config = m88e1510_hwmon_temp_config,
-};
-
-static const struct hwmon_channel_info *m88e1510_hwmon_info[] = {
- &m88e1121_hwmon_chip,
- &m88e1510_hwmon_temp,
- NULL
-};
-
-static const struct hwmon_ops m88e1510_hwmon_hwmon_ops = {
- .is_visible = m88e1510_hwmon_is_visible,
- .read = m88e1510_hwmon_read,
- .write = m88e1510_hwmon_write,
-};
-
-static const struct hwmon_chip_info m88e1510_hwmon_chip_info = {
- .ops = &m88e1510_hwmon_hwmon_ops,
- .info = m88e1510_hwmon_info,
-};
-
static int m88e6390_get_temp(struct phy_device *phydev, long *temp)
{
int sum = 0;
@@ -2459,63 +2317,112 @@ error:
return ret;
}
-static int m88e6390_hwmon_read(struct device *dev,
- enum hwmon_sensor_types type,
- u32 attr, int channel, long *temp)
+static int marvell_hwmon_read(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, long *temp)
{
struct phy_device *phydev = dev_get_drvdata(dev);
- int err;
+ const struct marvell_hwmon_ops *ops = to_marvell_hwmon_ops(phydev);
+ int err = -EOPNOTSUPP;
switch (attr) {
case hwmon_temp_input:
- err = m88e6390_get_temp(phydev, temp);
+ if (ops->get_temp)
+ err = ops->get_temp(phydev, temp);
+ break;
+ case hwmon_temp_crit:
+ if (ops->get_temp_critical)
+ err = ops->get_temp_critical(phydev, temp);
+ break;
+ case hwmon_temp_max_alarm:
+ if (ops->get_temp_alarm)
+ err = ops->get_temp_alarm(phydev, temp);
+ break;
+ }
+
+ return err;
+}
+
+static int marvell_hwmon_write(struct device *dev, enum hwmon_sensor_types type,
+ u32 attr, int channel, long temp)
+{
+ struct phy_device *phydev = dev_get_drvdata(dev);
+ const struct marvell_hwmon_ops *ops = to_marvell_hwmon_ops(phydev);
+ int err = -EOPNOTSUPP;
+
+ switch (attr) {
+ case hwmon_temp_crit:
+ if (ops->set_temp_critical)
+ err = ops->set_temp_critical(phydev, temp);
break;
default:
- return -EOPNOTSUPP;
+ fallthrough;
}
return err;
}
-static umode_t m88e6390_hwmon_is_visible(const void *data,
- enum hwmon_sensor_types type,
- u32 attr, int channel)
+static umode_t marvell_hwmon_is_visible(const void *data,
+ enum hwmon_sensor_types type,
+ u32 attr, int channel)
{
+ const struct phy_device *phydev = data;
+ const struct marvell_hwmon_ops *ops = to_marvell_hwmon_ops(phydev);
+
if (type != hwmon_temp)
return 0;
switch (attr) {
case hwmon_temp_input:
- return 0444;
+ return ops->get_temp ? 0444 : 0;
+ case hwmon_temp_max_alarm:
+ return ops->get_temp_alarm ? 0444 : 0;
+ case hwmon_temp_crit:
+ return (ops->get_temp_critical ? 0444 : 0) |
+ (ops->set_temp_critical ? 0200 : 0);
default:
return 0;
}
}
-static u32 m88e6390_hwmon_temp_config[] = {
- HWMON_T_INPUT,
+static u32 marvell_hwmon_chip_config[] = {
+ HWMON_C_REGISTER_TZ,
0
};
-static const struct hwmon_channel_info m88e6390_hwmon_temp = {
+static const struct hwmon_channel_info marvell_hwmon_chip = {
+ .type = hwmon_chip,
+ .config = marvell_hwmon_chip_config,
+};
+
+/* we can define HWMON_T_CRIT and HWMON_T_MAX_ALARM even though these are not
+ * defined for all PHYs, because the hwmon code checks whether the attributes
+ * exists via the .is_visible method
+ */
+static u32 marvell_hwmon_temp_config[] = {
+ HWMON_T_INPUT | HWMON_T_CRIT | HWMON_T_MAX_ALARM,
+ 0
+};
+
+static const struct hwmon_channel_info marvell_hwmon_temp = {
.type = hwmon_temp,
- .config = m88e6390_hwmon_temp_config,
+ .config = marvell_hwmon_temp_config,
};
-static const struct hwmon_channel_info *m88e6390_hwmon_info[] = {
- &m88e1121_hwmon_chip,
- &m88e6390_hwmon_temp,
+static const struct hwmon_channel_info *marvell_hwmon_info[] = {
+ &marvell_hwmon_chip,
+ &marvell_hwmon_temp,
NULL
};
-static const struct hwmon_ops m88e6390_hwmon_hwmon_ops = {
- .is_visible = m88e6390_hwmon_is_visible,
- .read = m88e6390_hwmon_read,
+static const struct hwmon_ops marvell_hwmon_hwmon_ops = {
+ .is_visible = marvell_hwmon_is_visible,
+ .read = marvell_hwmon_read,
+ .write = marvell_hwmon_write,
};
-static const struct hwmon_chip_info m88e6390_hwmon_chip_info = {
- .ops = &m88e6390_hwmon_hwmon_ops,
- .info = m88e6390_hwmon_info,
+static const struct hwmon_chip_info marvell_hwmon_chip_info = {
+ .ops = &marvell_hwmon_hwmon_ops,
+ .info = marvell_hwmon_info,
};
static int marvell_hwmon_name(struct phy_device *phydev)
@@ -2538,49 +2445,48 @@ static int marvell_hwmon_name(struct phy
return 0;
}
-static int marvell_hwmon_probe(struct phy_device *phydev,
- const struct hwmon_chip_info *chip)
+static int marvell_hwmon_probe(struct phy_device *phydev)
{
+ const struct marvell_hwmon_ops *ops = to_marvell_hwmon_ops(phydev);
struct marvell_priv *priv = phydev->priv;
struct device *dev = &phydev->mdio.dev;
int err;
+ if (!ops)
+ return 0;
+
err = marvell_hwmon_name(phydev);
if (err)
return err;
priv->hwmon_dev = devm_hwmon_device_register_with_info(
- dev, priv->hwmon_name, phydev, chip, NULL);
+ dev, priv->hwmon_name, phydev, &marvell_hwmon_chip_info, NULL);
return PTR_ERR_OR_ZERO(priv->hwmon_dev);
}
-static int m88e1121_hwmon_probe(struct phy_device *phydev)
-{
- return marvell_hwmon_probe(phydev, &m88e1121_hwmon_chip_info);
-}
+static const struct marvell_hwmon_ops m88e1121_hwmon_ops = {
+ .get_temp = m88e1121_get_temp,
+};
-static int m88e1510_hwmon_probe(struct phy_device *phydev)
-{
- return marvell_hwmon_probe(phydev, &m88e1510_hwmon_chip_info);
-}
+static const struct marvell_hwmon_ops m88e1510_hwmon_ops = {
+ .get_temp = m88e1510_get_temp,
+ .get_temp_critical = m88e1510_get_temp_critical,
+ .set_temp_critical = m88e1510_set_temp_critical,
+ .get_temp_alarm = m88e1510_get_temp_alarm,
+};
+
+static const struct marvell_hwmon_ops m88e6390_hwmon_ops = {
+ .get_temp = m88e6390_get_temp,
+};
+
+#define DEF_MARVELL_HWMON_OPS(s) (&(s))
-static int m88e6390_hwmon_probe(struct phy_device *phydev)
-{
- return marvell_hwmon_probe(phydev, &m88e6390_hwmon_chip_info);
-}
#else
-static int m88e1121_hwmon_probe(struct phy_device *phydev)
-{
- return 0;
-}
-static int m88e1510_hwmon_probe(struct phy_device *phydev)
-{
- return 0;
-}
+#define DEF_MARVELL_HWMON_OPS(s) NULL
-static int m88e6390_hwmon_probe(struct phy_device *phydev)
+static int marvell_hwmon_probe(struct phy_device *phydev)
{
return 0;
}
@@ -2596,40 +2502,7 @@ static int marvell_probe(struct phy_devi
phydev->priv = priv;
- return 0;
-}
-
-static int m88e1121_probe(struct phy_device *phydev)
-{
- int err;
-
- err = marvell_probe(phydev);
- if (err)
- return err;
-
- return m88e1121_hwmon_probe(phydev);
-}
-
-static int m88e1510_probe(struct phy_device *phydev)
-{
- int err;
-
- err = marvell_probe(phydev);
- if (err)
- return err;
-
- return m88e1510_hwmon_probe(phydev);
-}
-
-static int m88e6390_probe(struct phy_device *phydev)
-{
- int err;
-
- err = marvell_probe(phydev);
- if (err)
- return err;
-
- return m88e6390_hwmon_probe(phydev);
+ return marvell_hwmon_probe(phydev);
}
static struct phy_driver marvell_drivers[] = {
@@ -2714,8 +2587,9 @@ static struct phy_driver marvell_drivers
.phy_id = MARVELL_PHY_ID_88E1121R,
.phy_id_mask = MARVELL_PHY_ID_MASK,
.name = "Marvell 88E1121R",
+ .driver_data = DEF_MARVELL_HWMON_OPS(m88e1121_hwmon_ops),
/* PHY_GBIT_FEATURES */
- .probe = m88e1121_probe,
+ .probe = marvell_probe,
.config_init = marvell_config_init,
.config_aneg = m88e1121_config_aneg,
.read_status = marvell_read_status,
@@ -2834,9 +2708,10 @@ static struct phy_driver marvell_drivers
.phy_id = MARVELL_PHY_ID_88E1510,
.phy_id_mask = MARVELL_PHY_ID_MASK,
.name = "Marvell 88E1510",
+ .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
.features = PHY_GBIT_FIBRE_FEATURES,
.flags = PHY_POLL_CABLE_TEST,
- .probe = m88e1510_probe,
+ .probe = marvell_probe,
.config_init = m88e1510_config_init,
.config_aneg = m88e1510_config_aneg,
.read_status = marvell_read_status,
@@ -2863,9 +2738,10 @@ static struct phy_driver marvell_drivers
.phy_id = MARVELL_PHY_ID_88E1540,
.phy_id_mask = MARVELL_PHY_ID_MASK,
.name = "Marvell 88E1540",
+ .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
/* PHY_GBIT_FEATURES */
.flags = PHY_POLL_CABLE_TEST,
- .probe = m88e1510_probe,
+ .probe = marvell_probe,
.config_init = marvell_config_init,
.config_aneg = m88e1510_config_aneg,
.read_status = marvell_read_status,
@@ -2889,7 +2765,8 @@ static struct phy_driver marvell_drivers
.phy_id = MARVELL_PHY_ID_88E1545,
.phy_id_mask = MARVELL_PHY_ID_MASK,
.name = "Marvell 88E1545",
- .probe = m88e1510_probe,
+ .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
+ .probe = marvell_probe,
/* PHY_GBIT_FEATURES */
.flags = PHY_POLL_CABLE_TEST,
.config_init = marvell_config_init,
@@ -2935,9 +2812,10 @@ static struct phy_driver marvell_drivers
.phy_id = MARVELL_PHY_ID_88E6341_FAMILY,
.phy_id_mask = MARVELL_PHY_ID_MASK,
.name = "Marvell 88E6341 Family",
+ .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
/* PHY_GBIT_FEATURES */
.flags = PHY_POLL_CABLE_TEST,
- .probe = m88e1510_probe,
+ .probe = marvell_probe,
.config_init = marvell_config_init,
.config_aneg = m88e6390_config_aneg,
.read_status = marvell_read_status,
@@ -2961,9 +2839,10 @@ static struct phy_driver marvell_drivers
.phy_id = MARVELL_PHY_ID_88E6390_FAMILY,
.phy_id_mask = MARVELL_PHY_ID_MASK,
.name = "Marvell 88E6390 Family",
+ .driver_data = DEF_MARVELL_HWMON_OPS(m88e6390_hwmon_ops),
/* PHY_GBIT_FEATURES */
.flags = PHY_POLL_CABLE_TEST,
- .probe = m88e6390_probe,
+ .probe = marvell_probe,
.config_init = marvell_config_init,
.config_aneg = m88e6390_config_aneg,
.read_status = marvell_read_status,
@@ -2987,7 +2866,8 @@ static struct phy_driver marvell_drivers
.phy_id = MARVELL_PHY_ID_88E1340S,
.phy_id_mask = MARVELL_PHY_ID_MASK,
.name = "Marvell 88E1340S",
- .probe = m88e1510_probe,
+ .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
+ .probe = marvell_probe,
/* PHY_GBIT_FEATURES */
.config_init = marvell_config_init,
.config_aneg = m88e1510_config_aneg,
@@ -3009,7 +2889,8 @@ static struct phy_driver marvell_drivers
.phy_id = MARVELL_PHY_ID_88E1548P,
.phy_id_mask = MARVELL_PHY_ID_MASK,
.name = "Marvell 88E1548P",
- .probe = m88e1510_probe,
+ .driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
+ .probe = marvell_probe,
.features = PHY_GBIT_FIBRE_FEATURES,
.config_init = marvell_config_init,
.config_aneg = m88e1510_config_aneg,
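
The refactor is a standard ops-table pattern: one shared hwmon front end dispatches through a per-chip structure of function pointers, and a NULL member both yields -EOPNOTSUPP and hides the attribute via .is_visible. A self-contained illustration of the pattern (all names invented for the sketch):

    #include <stdio.h>

    struct hwmon_chip_ops {
        int (*get_temp)(long *temp);
        int (*set_temp_critical)(long temp);    /* NULL: chip cannot do this */
    };

    static int chip_a_get_temp(long *temp)
    {
        *temp = 42000;                          /* millidegrees, as in hwmon */
        return 0;
    }

    static const struct hwmon_chip_ops chip_a = {
        .get_temp = chip_a_get_temp,
        /* .set_temp_critical left NULL, so the attribute stays hidden */
    };

    /* the single shared front end; capability checks replace the three
     * near-identical per-chip copies the patch removes */
    static int generic_read_temp(const struct hwmon_chip_ops *ops, long *temp)
    {
        if (!ops->get_temp)
            return -95;                         /* -EOPNOTSUPP */
        return ops->get_temp(temp);
    }

    int main(void)
    {
        long t;

        if (generic_read_temp(&chip_a, &t) == 0)
            printf("temp = %ld millidegrees\n", t);
        printf("crit writable: %s\n", chip_a.set_temp_critical ? "yes" : "no");
        return 0;
    }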

View File

@ -1,161 +0,0 @@
From b697d9d38a5a5ab405d7cc4743d39fe2c5d7517c Mon Sep 17 00:00:00 2001
From: Ivan Bornyakov <i.bornyakov@metrotek.ru>
Date: Thu, 12 Aug 2021 16:42:56 +0300
Subject: [PATCH] net: phy: marvell: add SFP support for 88E1510
Add support for SFP cages connected to the Marvell 88E1512 transceiver.
The 88E1512 supports the SGMII/1000Base-X/100Base-FX media types with RGMII
on the system interface. Configure the PHY to the appropriate mode depending
on the type of SFP inserted. On SFP removal, configure the PHY back to
RGMII-copper mode so the RJ-45 port can still work.
Signed-off-by: Ivan Bornyakov <i.bornyakov@metrotek.ru>
Link: https://lore.kernel.org/r/20210812134256.2436-1-i.bornyakov@metrotek.ru
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
drivers/net/phy/marvell.c | 105 +++++++++++++++++++++++++++++++++++++-
1 file changed, 104 insertions(+), 1 deletion(-)
--- a/drivers/net/phy/marvell.c
+++ b/drivers/net/phy/marvell.c
@@ -32,6 +32,7 @@
#include <linux/marvell_phy.h>
#include <linux/bitfield.h>
#include <linux/of.h>
+#include <linux/sfp.h>
#include <linux/io.h>
#include <asm/irq.h>
@@ -46,6 +47,7 @@
#define MII_MARVELL_MISC_TEST_PAGE 0x06
#define MII_MARVELL_VCT7_PAGE 0x07
#define MII_MARVELL_WOL_PAGE 0x11
+#define MII_MARVELL_MODE_PAGE 0x12
#define MII_M1011_IEVENT 0x13
#define MII_M1011_IEVENT_CLEAR 0x0000
@@ -162,7 +164,14 @@
#define MII_88E1510_GEN_CTRL_REG_1 0x14
#define MII_88E1510_GEN_CTRL_REG_1_MODE_MASK 0x7
+#define MII_88E1510_GEN_CTRL_REG_1_MODE_RGMII 0x0 /* RGMII to copper */
#define MII_88E1510_GEN_CTRL_REG_1_MODE_SGMII 0x1 /* SGMII to copper */
+/* RGMII to 1000BASE-X */
+#define MII_88E1510_GEN_CTRL_REG_1_MODE_RGMII_1000X 0x2
+/* RGMII to 100BASE-FX */
+#define MII_88E1510_GEN_CTRL_REG_1_MODE_RGMII_100FX 0x3
+/* RGMII to SGMII */
+#define MII_88E1510_GEN_CTRL_REG_1_MODE_RGMII_SGMII 0x4
#define MII_88E1510_GEN_CTRL_REG_1_RESET 0x8000 /* Soft reset */
#define MII_VCT5_TX_RX_MDI0_COUPLING 0x10
@@ -2505,6 +2514,100 @@ static int marvell_probe(struct phy_devi
return marvell_hwmon_probe(phydev);
}
+static int m88e1510_sfp_insert(void *upstream, const struct sfp_eeprom_id *id)
+{
+ struct phy_device *phydev = upstream;
+ phy_interface_t interface;
+ struct device *dev;
+ int oldpage;
+ int ret = 0;
+ u16 mode;
+
+ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported) = { 0, };
+
+ dev = &phydev->mdio.dev;
+
+ sfp_parse_support(phydev->sfp_bus, id, supported);
+ interface = sfp_select_interface(phydev->sfp_bus, supported);
+
+ dev_info(dev, "%s SFP module inserted\n", phy_modes(interface));
+
+ switch (interface) {
+ case PHY_INTERFACE_MODE_1000BASEX:
+ mode = MII_88E1510_GEN_CTRL_REG_1_MODE_RGMII_1000X;
+
+ break;
+ case PHY_INTERFACE_MODE_100BASEX:
+ mode = MII_88E1510_GEN_CTRL_REG_1_MODE_RGMII_100FX;
+
+ break;
+ case PHY_INTERFACE_MODE_SGMII:
+ mode = MII_88E1510_GEN_CTRL_REG_1_MODE_RGMII_SGMII;
+
+ break;
+ default:
+ dev_err(dev, "Incompatible SFP module inserted\n");
+
+ return -EINVAL;
+ }
+
+ oldpage = phy_select_page(phydev, MII_MARVELL_MODE_PAGE);
+ if (oldpage < 0)
+ goto error;
+
+ ret = __phy_modify(phydev, MII_88E1510_GEN_CTRL_REG_1,
+ MII_88E1510_GEN_CTRL_REG_1_MODE_MASK, mode);
+ if (ret < 0)
+ goto error;
+
+ ret = __phy_set_bits(phydev, MII_88E1510_GEN_CTRL_REG_1,
+ MII_88E1510_GEN_CTRL_REG_1_RESET);
+
+error:
+ return phy_restore_page(phydev, oldpage, ret);
+}
+
+static void m88e1510_sfp_remove(void *upstream)
+{
+ struct phy_device *phydev = upstream;
+ int oldpage;
+ int ret = 0;
+
+ oldpage = phy_select_page(phydev, MII_MARVELL_MODE_PAGE);
+ if (oldpage < 0)
+ goto error;
+
+ ret = __phy_modify(phydev, MII_88E1510_GEN_CTRL_REG_1,
+ MII_88E1510_GEN_CTRL_REG_1_MODE_MASK,
+ MII_88E1510_GEN_CTRL_REG_1_MODE_RGMII);
+ if (ret < 0)
+ goto error;
+
+ ret = __phy_set_bits(phydev, MII_88E1510_GEN_CTRL_REG_1,
+ MII_88E1510_GEN_CTRL_REG_1_RESET);
+
+error:
+ phy_restore_page(phydev, oldpage, ret);
+}
+
+static const struct sfp_upstream_ops m88e1510_sfp_ops = {
+ .module_insert = m88e1510_sfp_insert,
+ .module_remove = m88e1510_sfp_remove,
+ .attach = phy_sfp_attach,
+ .detach = phy_sfp_detach,
+};
+
+static int m88e1510_probe(struct phy_device *phydev)
+{
+ int err;
+
+ err = marvell_probe(phydev);
+ if (err)
+ return err;
+
+ return phy_sfp_probe(phydev, &m88e1510_sfp_ops);
+}
+
static struct phy_driver marvell_drivers[] = {
{
.phy_id = MARVELL_PHY_ID_88E1101,
@@ -2711,7 +2814,7 @@ static struct phy_driver marvell_drivers
.driver_data = DEF_MARVELL_HWMON_OPS(m88e1510_hwmon_ops),
.features = PHY_GBIT_FIBRE_FEATURES,
.flags = PHY_POLL_CABLE_TEST,
- .probe = marvell_probe,
+ .probe = m88e1510_probe,
.config_init = m88e1510_config_init,
.config_aneg = m88e1510_config_aneg,
.read_status = marvell_read_status,
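
At its core, the module_insert handler is a mapping from the interface mode chosen by sfp_select_interface() to the three mode bits of MII_88E1510_GEN_CTRL_REG_1, followed by a soft reset to latch them. A compact sketch of the mapping (mode values as defined in the patch; strings stand in for phy_interface_t here):

    #include <stdio.h>
    #include <string.h>

    /* mode values for GEN_CTRL_REG_1 as defined in the patch */
    enum { MODE_RGMII = 0x0, MODE_RGMII_1000X = 0x2,
           MODE_RGMII_100FX = 0x3, MODE_RGMII_SGMII = 0x4 };

    static int sfp_to_mode(const char *iface)
    {
        if (!strcmp(iface, "1000base-x"))
            return MODE_RGMII_1000X;
        if (!strcmp(iface, "100base-x"))
            return MODE_RGMII_100FX;
        if (!strcmp(iface, "sgmii"))
            return MODE_RGMII_SGMII;
        return -1;                          /* incompatible module */
    }

    int main(void)
    {
        int mode = sfp_to_mode("100base-x");

        if (mode < 0) {
            printf("Incompatible SFP module inserted\n");
            return 1;
        }
        printf("mode bits 0x%x, then soft reset to latch\n", mode);
        /* module removal would program MODE_RGMII so copper keeps working */
        return 0;
    }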

View File

@ -1,85 +0,0 @@
From 9d5ef190e5615a7b63af89f88c4106a5bc127974 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 5 Feb 2021 15:37:10 +0200
Subject: [PATCH] net: dsa: automatically bring up DSA master when opening user
port
DSA wants the master interface to be open before the user port is due to
historical reasons. The promiscuity of interfaces that are down used to
have issues, as referenced Lennert Buytenhek in commit df02c6ff2e39
("dsa: fix master interface allmulti/promisc handling").
The bugfix mentioned there, commit b6c40d68ff64 ("net: only invoke
dev->change_rx_flags when device is UP"), was basically a "don't do
that" approach to working around the promiscuity while down issue.
Further work done by Vlad Yasevich in commit d2615bf45069 ("net: core:
Always propagate flag changes to interfaces") has resolved the
underlying issue, and it is strictly up to the DSA and 8021q drivers
now, it is no longer mandated by the networking core that the master
interface must be up when changing its promiscuity.
From DSA's point of view, deciding to error out in dsa_slave_open
because the master isn't up is
(a) a bad user experience and
(b) knocking at an open door.
Even if there still was an issue with promiscuity while down, DSA could
still just open the master and avoid it.
Doing it this way has the additional benefit that user space can now
remove DSA-specific workarounds, like systemd-networkd with BindCarrier:
https://github.com/systemd/systemd/issues/7478
And we can finally remove one of the 2 bullets in the "Common pitfalls
using DSA setups" chapter.
Tested with two cascaded DSA switches:
$ ip link set sw0p2 up
fsl_enetc 0000:00:00.2 eno2: configuring for fixed/internal link mode
fsl_enetc 0000:00:00.2 eno2: Link is Up - 1Gbps/Full - flow control rx/tx
mscc_felix 0000:00:00.5 swp0: configuring for fixed/sgmii link mode
mscc_felix 0000:00:00.5 swp0: Link is Up - 1Gbps/Full - flow control off
8021q: adding VLAN 0 to HW filter on device swp0
sja1105 spi2.0 sw0p2: configuring for phy/rgmii-id link mode
IPv6: ADDRCONF(NETDEV_CHANGE): eno2: link becomes ready
IPv6: ADDRCONF(NETDEV_CHANGE): swp0: link becomes ready
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
Documentation/networking/dsa/dsa.rst | 4 ----
net/dsa/slave.c | 7 +++++--
2 files changed, 5 insertions(+), 6 deletions(-)
--- a/Documentation/networking/dsa/dsa.rst
+++ b/Documentation/networking/dsa/dsa.rst
@@ -273,10 +273,6 @@ will not make us go through the switch t
the Ethernet switch on the other end, expecting a tag will typically drop this
frame.
-Slave network devices check that the master network device is UP before allowing
-you to administratively bring UP these slave network devices. A common
-configuration mistake is forgetting to bring UP the master network device first.
-
Interactions with other subsystems
==================================
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -68,8 +68,11 @@ static int dsa_slave_open(struct net_dev
struct dsa_port *dp = dsa_slave_to_port(dev);
int err;
- if (!(master->flags & IFF_UP))
- return -ENETDOWN;
+ err = dev_open(master, NULL);
+ if (err < 0) {
+ netdev_err(dev, "failed to open master %s\n", master->name);
+ goto out;
+ }
if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) {
err = dev_uc_add(master, dev->dev_addr);

View File

@ -1,126 +0,0 @@
From 90dc8fd36078a536671adae884d0b929cce6480a Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 6 Jan 2021 11:51:30 +0200
Subject: [PATCH] net: bridge: notify switchdev of disappearance of old FDB
entry upon migration
Currently the bridge emits atomic switchdev notifications for
dynamically learnt FDB entries. Monitoring these notifications works
wonders for switchdev drivers that want to keep their hardware FDB in
sync with the bridge's FDB.
For example, station A wants to talk to station B in the diagram below,
and we are concerned with the behavior of the bridge on the DUT device:
                  DUT
 +-------------------------------------+
 |                 br0                 |
 | +------+ +------+ +------+ +------+ |
 | |      | |      | |      | |      | |
 | | swp0 | | swp1 | | swp2 | | eth0 | |
 +-------------------------------------+
       |        |                 |
   Station A    |                 |
                |                 |
          +--+------+--+    +--+------+--+
          |  |      |  |    |  |      |  |
          |  | swp0 |  |    |  | swp0 |  |
  Another |  +------+  |    |  +------+  | Another
   switch |     br0    |    |     br0    | switch
          |  +------+  |    |  +------+  |
          |  |      |  |    |  |      |  |
          |  | swp1 |  |    |  | swp1 |  |
          +--+------+--+    +--+------+--+
                                   |
                               Station B
Interfaces swp0, swp1, swp2 are handled by a switchdev driver that has
the following property: frames injected from its control interface bypass
the internal address analyzer logic, and therefore, this hardware does
not learn from the source address of packets transmitted by the network
stack through it. So, since bridging between eth0 (where Station B is
attached) and swp0 (where Station A is attached) is done in software,
the switchdev hardware will never learn the source address of Station B.
So the traffic towards that destination will be treated as unknown, i.e.
flooded.
This is where the bridge notifications come in handy. When br0 on the
DUT sees frames with Station B's MAC address on eth0, the switchdev
driver gets these notifications and can install a rule to send frames
towards Station B's address that are incoming from swp0, swp1, swp2,
only towards the control interface. This is all switchdev driver private
business, which the notification makes possible.
All is fine until someone unplugs Station B's cable and moves it to the
other switch:
                  DUT
 +-------------------------------------+
 |                 br0                 |
 | +------+ +------+ +------+ +------+ |
 | |      | |      | |      | |      | |
 | | swp0 | | swp1 | | swp2 | | eth0 | |
 +-------------------------------------+
       |        |                 |
   Station A    |                 |
                |                 |
          +--+------+--+    +--+------+--+
          |  |      |  |    |  |      |  |
          |  | swp0 |  |    |  | swp0 |  |
  Another |  +------+  |    |  +------+  | Another
   switch |     br0    |    |     br0    | switch
          |  +------+  |    |  +------+  |
          |  |      |  |    |  |      |  |
          |  | swp1 |  |    |  | swp1 |  |
          +--+------+--+    +--+------+--+
                |
            Station B
Luckily for the use cases we care about, Station B is noisy enough that
the DUT hears it (on swp1 this time). swp1 receives the frames and
delivers them to the bridge, which enters the unlikely path in br_fdb_update
of updating an existing entry. It moves the entry in the software bridge
to swp1 and emits an addition notification towards that.
As far as the switchdev driver is concerned, all that it needs to ensure
is that traffic between Station A and Station B is not forever broken.
If it does nothing, then the stale rule to send frames for Station B
towards the control interface remains in place. But Station B is no
longer reachable via the control interface; it is now behind a port that can
offload the bridge port learning attribute. It's just that the port is
prevented from learning this address, since the rule overrides FDB
updates. So the rule needs to go. The question is via what mechanism.
It sure would be possible for this switchdev driver to keep track of all
addresses which are sent to the control interface, and then also listen
for bridge notifier events on its own ports, searching for the ones that
have a MAC address which was previously sent to the control interface.
But this is cumbersome and inefficient. Instead, with one small change,
the bridge could notify of the address deletion from the old port, in a
symmetrical manner with how it did for the insertion. Then the switchdev
driver would not be required to monitor learn/forget events for its own
ports. It could just delete the rule towards the control interface upon
bridge entry migration. This would make hardware address learning be
possible again. Then it would take a few more packets until the hardware
and software FDB would be in sync again.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com>
Reviewed-by: Ido Schimmel <idosch@nvidia.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/bridge/br_fdb.c | 1 +
1 file changed, 1 insertion(+)
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -602,6 +602,7 @@ void br_fdb_update(struct net_bridge *br
/* fastpath: update of existing entry */
if (unlikely(source != fdb->dst &&
!test_bit(BR_FDB_STICKY, &fdb->flags))) {
+ br_switchdev_fdb_notify(fdb, RTM_DELNEIGH);
fdb->dst = source;
fdb_modified = true;
/* Take over HW learned entry */
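
The one-line change gives switchdev listeners a symmetric pair of events: on migration, the old (MAC, port) association is reported as deleted before the new one is announced. A toy model of the fastpath showing the event order a listener now sees (simplified; real FDB entries also carry VLAN and flags):

    #include <stdio.h>
    #include <string.h>

    struct fdb_entry {
        char mac[18];
        int port;
    };

    /* mimics the br_fdb_update() fastpath: on a port change, emit the
     * delete for the stale association before moving the entry */
    static void fdb_update(struct fdb_entry *fdb, int port)
    {
        if (fdb->port != port) {
            printf("RTM_DELNEIGH %s port %d\n", fdb->mac, fdb->port);
            fdb->port = port;
        }
        printf("RTM_NEWNEIGH %s port %d\n", fdb->mac, fdb->port);
    }

    int main(void)
    {
        struct fdb_entry b = { "00:11:22:33:44:55", 4 };  /* learned on eth0 */

        /* Station B roams: the same MAC now ingresses on swp1 */
        fdb_update(&b, 1);
        return 0;
    }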

View File

@ -1,52 +0,0 @@
From 2fd186501b1cff155cc4a755c210793cfc0dffb5 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 6 Jan 2021 11:51:31 +0200
Subject: [PATCH] net: dsa: be louder when a non-legacy FDB operation fails
The dev_close() call was added in commit c9eb3e0f8701 ("net: dsa: Add
support for learning FDB through notification") "to indicate inconsistent
situation" when we could not delete an FDB entry from the port.
bridge fdb del d8:58:d7:00:ca:6d dev swp0 self master
If the above fails, it is not helpful to only print a message at the
netdev_dbg log level, but at the same time bringing the interface down
is a bit drastic.
So increase the verbosity of the error message, and drop dev_close().
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/dsa/slave.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2112,7 +2112,9 @@ static void dsa_slave_switchdev_event_wo
err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid);
if (err) {
- netdev_dbg(dev, "fdb add failed err=%d\n", err);
+ netdev_err(dev,
+ "failed to add %pM vid %d to fdb: %d\n",
+ fdb_info->addr, fdb_info->vid, err);
break;
}
fdb_info->offloaded = true;
@@ -2127,9 +2129,11 @@ static void dsa_slave_switchdev_event_wo
err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid);
if (err) {
- netdev_dbg(dev, "fdb del failed err=%d\n", err);
- dev_close(dev);
+ netdev_err(dev,
+ "failed to delete %pM vid %d from fdb: %d\n",
+ fdb_info->addr, fdb_info->vid, err);
}
+
break;
}
rtnl_unlock();

View File

@ -1,226 +0,0 @@
From c4bb76a9a0ef87c4cc1f636defed5f12deb9f5a7 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 6 Jan 2021 11:51:32 +0200
Subject: [PATCH] net: dsa: don't use switchdev_notifier_fdb_info in
dsa_switchdev_event_work
Currently DSA doesn't add FDB entries on the CPU port, because it only
does so through switchdev, which is associated with a net_device, and
there are none of those for the CPU port.
But actually FDB addresses on the CPU port have some use cases of their
own, if the switchdev operations are initiated from within the DSA
layer. There is just one problem with the existing code: it passes a
structure in dsa_switchdev_event_work which was retrieved directly from
switchdev, so it contains a net_device. We need to generalize the
contents to something that covers the CPU port as well: the "ds, port"
tuple is fine for that.
Note that the new procedure for notifying the successful FDB offload is
inspired by the rocker model.
Also, nothing was being done if added_by_user was false. Let's check for
that a lot earlier, and don't actually bother to schedule the worker
for nothing.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/dsa/dsa_priv.h | 12 +++++
net/dsa/slave.c | 106 ++++++++++++++++++++++-----------------------
2 files changed, 65 insertions(+), 53 deletions(-)
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -73,6 +73,18 @@ struct dsa_notifier_mtu_info {
int mtu;
};
+struct dsa_switchdev_event_work {
+ struct dsa_switch *ds;
+ int port;
+ struct work_struct work;
+ unsigned long event;
+ /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and
+ * SWITCHDEV_FDB_DEL_TO_DEVICE
+ */
+ unsigned char addr[ETH_ALEN];
+ u16 vid;
+};
+
struct dsa_slave_priv {
/* Copy of CPU port xmit for faster access in slave transmit hot path */
struct sk_buff * (*xmit)(struct sk_buff *skb,
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2087,76 +2087,66 @@ static int dsa_slave_netdevice_event(str
return NOTIFY_DONE;
}
-struct dsa_switchdev_event_work {
- struct work_struct work;
- struct switchdev_notifier_fdb_info fdb_info;
- struct net_device *dev;
- unsigned long event;
-};
+static void
+dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work)
+{
+ struct dsa_switch *ds = switchdev_work->ds;
+ struct switchdev_notifier_fdb_info info;
+ struct dsa_port *dp;
+
+ if (!dsa_is_user_port(ds, switchdev_work->port))
+ return;
+
+ info.addr = switchdev_work->addr;
+ info.vid = switchdev_work->vid;
+ info.offloaded = true;
+ dp = dsa_to_port(ds, switchdev_work->port);
+ call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED,
+ dp->slave, &info.info, NULL);
+}
static void dsa_slave_switchdev_event_work(struct work_struct *work)
{
struct dsa_switchdev_event_work *switchdev_work =
container_of(work, struct dsa_switchdev_event_work, work);
- struct net_device *dev = switchdev_work->dev;
- struct switchdev_notifier_fdb_info *fdb_info;
- struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_switch *ds = switchdev_work->ds;
+ struct dsa_port *dp;
int err;
+ dp = dsa_to_port(ds, switchdev_work->port);
+
rtnl_lock();
switch (switchdev_work->event) {
case SWITCHDEV_FDB_ADD_TO_DEVICE:
- fdb_info = &switchdev_work->fdb_info;
- if (!fdb_info->added_by_user)
- break;
-
- err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid);
+ err = dsa_port_fdb_add(dp, switchdev_work->addr,
+ switchdev_work->vid);
if (err) {
- netdev_err(dev,
- "failed to add %pM vid %d to fdb: %d\n",
- fdb_info->addr, fdb_info->vid, err);
+ dev_err(ds->dev,
+ "port %d failed to add %pM vid %d to fdb: %d\n",
+ dp->index, switchdev_work->addr,
+ switchdev_work->vid, err);
break;
}
- fdb_info->offloaded = true;
- call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev,
- &fdb_info->info, NULL);
+ dsa_fdb_offload_notify(switchdev_work);
break;
case SWITCHDEV_FDB_DEL_TO_DEVICE:
- fdb_info = &switchdev_work->fdb_info;
- if (!fdb_info->added_by_user)
- break;
-
- err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid);
+ err = dsa_port_fdb_del(dp, switchdev_work->addr,
+ switchdev_work->vid);
if (err) {
- netdev_err(dev,
- "failed to delete %pM vid %d from fdb: %d\n",
- fdb_info->addr, fdb_info->vid, err);
+ dev_err(ds->dev,
+ "port %d failed to delete %pM vid %d from fdb: %d\n",
+ dp->index, switchdev_work->addr,
+ switchdev_work->vid, err);
}
break;
}
rtnl_unlock();
- kfree(switchdev_work->fdb_info.addr);
kfree(switchdev_work);
- dev_put(dev);
-}
-
-static int
-dsa_slave_switchdev_fdb_work_init(struct dsa_switchdev_event_work *
- switchdev_work,
- const struct switchdev_notifier_fdb_info *
- fdb_info)
-{
- memcpy(&switchdev_work->fdb_info, fdb_info,
- sizeof(switchdev_work->fdb_info));
- switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
- if (!switchdev_work->fdb_info.addr)
- return -ENOMEM;
- ether_addr_copy((u8 *)switchdev_work->fdb_info.addr,
- fdb_info->addr);
- return 0;
+ if (dsa_is_user_port(ds, dp->index))
+ dev_put(dp->slave);
}
/* Called under rcu_read_lock() */
@@ -2164,7 +2154,9 @@ static int dsa_slave_switchdev_event(str
unsigned long event, void *ptr)
{
struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+ const struct switchdev_notifier_fdb_info *fdb_info;
struct dsa_switchdev_event_work *switchdev_work;
+ struct dsa_port *dp;
int err;
if (event == SWITCHDEV_PORT_ATTR_SET) {
@@ -2177,20 +2169,32 @@ static int dsa_slave_switchdev_event(str
if (!dsa_slave_dev_check(dev))
return NOTIFY_DONE;
+ dp = dsa_slave_to_port(dev);
+
switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
if (!switchdev_work)
return NOTIFY_BAD;
INIT_WORK(&switchdev_work->work,
dsa_slave_switchdev_event_work);
- switchdev_work->dev = dev;
+ switchdev_work->ds = dp->ds;
+ switchdev_work->port = dp->index;
switchdev_work->event = event;
switch (event) {
case SWITCHDEV_FDB_ADD_TO_DEVICE:
case SWITCHDEV_FDB_DEL_TO_DEVICE:
- if (dsa_slave_switchdev_fdb_work_init(switchdev_work, ptr))
- goto err_fdb_work_init;
+ fdb_info = ptr;
+
+ if (!fdb_info->added_by_user) {
+ kfree(switchdev_work);
+ return NOTIFY_OK;
+ }
+
+ ether_addr_copy(switchdev_work->addr,
+ fdb_info->addr);
+ switchdev_work->vid = fdb_info->vid;
+
dev_hold(dev);
break;
default:
@@ -2200,10 +2204,6 @@ static int dsa_slave_switchdev_event(str
dsa_schedule_work(&switchdev_work->work);
return NOTIFY_OK;
-
-err_fdb_work_init:
- kfree(switchdev_work);
- return NOTIFY_BAD;
}
static int dsa_slave_switchdev_blocking_event(struct notifier_block *unused,
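
The deferred-work side of this is a general pattern: the atomic notifier copies only the scalars it needs (address, VLAN, port index) into a heap-allocated work item instead of holding pointers into notifier-owned data such as the net_device. A minimal user-space sketch of that pattern (no real work queue; names invented):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define ETH_ALEN 6

    struct fdb_work {
        int port;                           /* replaces the net_device pointer */
        unsigned char addr[ETH_ALEN];       /* copied, not referenced */
        unsigned short vid;
    };

    /* "notifier" context: must not sleep and must not keep pointers
     * into the caller's switchdev_notifier_fdb_info */
    static struct fdb_work *queue_fdb_work(int port, const unsigned char *addr,
                                           unsigned short vid)
    {
        struct fdb_work *w = calloc(1, sizeof(*w));

        if (!w)
            return NULL;
        w->port = port;
        memcpy(w->addr, addr, ETH_ALEN);    /* still valid after the notifier returns */
        w->vid = vid;
        return w;
    }

    int main(void)
    {
        unsigned char mac[ETH_ALEN] = { 0x00, 0x04, 0x9f, 0x05, 0x30, 0x00 };
        struct fdb_work *w = queue_fdb_work(2, mac, 100);

        if (w)
            printf("deferred: port %d vid %u\n", w->port, w->vid);
        free(w);
        return 0;
    }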

View File

@ -1,85 +0,0 @@
From 447d290a58bd335d68f665713842365d3d6447df Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 6 Jan 2021 11:51:33 +0200
Subject: [PATCH] net: dsa: move switchdev event implementation under the same
switch/case statement
We'll need to start listening to SWITCHDEV_FDB_{ADD,DEL}_TO_DEVICE
events even for interfaces where dsa_slave_dev_check returns false, so
we need that check inside the switch-case statement for SWITCHDEV_FDB_*.
This movement also avoids a useless allocation / free of switchdev_work
on the untreated "default event" case.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/dsa/slave.c | 35 ++++++++++++++++-------------------
1 file changed, 16 insertions(+), 19 deletions(-)
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2159,31 +2159,29 @@ static int dsa_slave_switchdev_event(str
struct dsa_port *dp;
int err;
- if (event == SWITCHDEV_PORT_ATTR_SET) {
+ switch (event) {
+ case SWITCHDEV_PORT_ATTR_SET:
err = switchdev_handle_port_attr_set(dev, ptr,
dsa_slave_dev_check,
dsa_slave_port_attr_set);
return notifier_from_errno(err);
- }
-
- if (!dsa_slave_dev_check(dev))
- return NOTIFY_DONE;
+ case SWITCHDEV_FDB_ADD_TO_DEVICE:
+ case SWITCHDEV_FDB_DEL_TO_DEVICE:
+ if (!dsa_slave_dev_check(dev))
+ return NOTIFY_DONE;
- dp = dsa_slave_to_port(dev);
+ dp = dsa_slave_to_port(dev);
- switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
- if (!switchdev_work)
- return NOTIFY_BAD;
-
- INIT_WORK(&switchdev_work->work,
- dsa_slave_switchdev_event_work);
- switchdev_work->ds = dp->ds;
- switchdev_work->port = dp->index;
- switchdev_work->event = event;
+ switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
+ if (!switchdev_work)
+ return NOTIFY_BAD;
+
+ INIT_WORK(&switchdev_work->work,
+ dsa_slave_switchdev_event_work);
+ switchdev_work->ds = dp->ds;
+ switchdev_work->port = dp->index;
+ switchdev_work->event = event;
- switch (event) {
- case SWITCHDEV_FDB_ADD_TO_DEVICE:
- case SWITCHDEV_FDB_DEL_TO_DEVICE:
fdb_info = ptr;
if (!fdb_info->added_by_user) {
@@ -2196,13 +2194,12 @@ static int dsa_slave_switchdev_event(str
switchdev_work->vid = fdb_info->vid;
dev_hold(dev);
+ dsa_schedule_work(&switchdev_work->work);
break;
default:
- kfree(switchdev_work);
return NOTIFY_DONE;
}
- dsa_schedule_work(&switchdev_work->work);
return NOTIFY_OK;
}

View File

@ -1,42 +0,0 @@
From 5fb4a451a87d8ed3363d28b63a3295399373d6c4 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 6 Jan 2021 11:51:34 +0200
Subject: [PATCH] net: dsa: exit early in dsa_slave_switchdev_event if we can't
program the FDB
Right now, the following would happen for a switch driver that does not
implement .port_fdb_add or .port_fdb_del.
dsa_slave_switchdev_event returns NOTIFY_OK and schedules:
-> dsa_slave_switchdev_event_work
-> dsa_port_fdb_add
-> dsa_port_notify(DSA_NOTIFIER_FDB_ADD)
-> dsa_switch_fdb_add
-> if (!ds->ops->port_fdb_add) return -EOPNOTSUPP;
-> an error is printed with dev_dbg, and
dsa_fdb_offload_notify(switchdev_work) is not called.
We can avoid scheduling the worker for nothing and say NOTIFY_DONE.
Because we don't call dsa_fdb_offload_notify, the static FDB entry will
remain just in the software bridge.
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
net/dsa/slave.c | 3 +++
1 file changed, 3 insertions(+)
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2172,6 +2172,9 @@ static int dsa_slave_switchdev_event(str
dp = dsa_slave_to_port(dev);
+ if (!dp->ds->ops->port_fdb_add || !dp->ds->ops->port_fdb_del)
+ return NOTIFY_DONE;
+
switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
if (!switchdev_work)
return NOTIFY_BAD;
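
The shape of the fix is "check capability before committing to asynchronous work": if the driver implements neither FDB operation, the notifier returns NOTIFY_DONE immediately and no work item is ever allocated. A sketch of that control flow outside the kernel (names invented):

    #include <stdio.h>

    struct switch_ops {
        int (*port_fdb_add)(int port);      /* NULL: driver can't program FDBs */
        int (*port_fdb_del)(int port);
    };

    static int handle_fdb_event(const struct switch_ops *ops, int port)
    {
        /* bail out before allocating or scheduling anything */
        if (!ops->port_fdb_add || !ops->port_fdb_del)
            return 0;                       /* NOTIFY_DONE: entry stays software-only */

        printf("schedule worker for port %d\n", port);
        return 1;                           /* NOTIFY_OK */
    }

    int main(void)
    {
        struct switch_ops no_fdb = { 0 };

        printf("result: %d\n", handle_fdb_event(&no_fdb, 3));
        return 0;
    }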

View File

@ -1,264 +0,0 @@
From d5f19486cee79d04c054427577ac96ed123706db Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Wed, 6 Jan 2021 11:51:35 +0200
Subject: [PATCH] net: dsa: listen for SWITCHDEV_{FDB,DEL}_ADD_TO_DEVICE on
foreign bridge neighbors
Some DSA switches (and not just DSA switches) cannot learn source MAC addresses from
packets injected from the CPU. They only perform hardware address
learning from inbound traffic.
This can be problematic when we have a bridge spanning some DSA switch
ports and some non-DSA ports (which we'll call "foreign interfaces" from
DSA's perspective).
There are 2 classes of problems created by the lack of learning on
CPU-injected traffic:
- excessive flooding, due to the fact that DSA treats those addresses as
unknown
- the risk of stale routes, which can lead to temporary packet loss
To illustrate the second class, consider the following situation, which
is common in production equipment (wireless access points, where there
is a WLAN interface and an Ethernet switch, and these form a single
bridging domain).
AP 1:
 +------------------------------------------------------------------------+
 |                                  br0                                   |
 +------------------------------------------------------------------------+
 +------------+ +------------+ +------------+ +------------+ +------------+
 |    swp0    | |    swp1    | |    swp2    | |    swp3    | |   wlan0    |
 +------------+ +------------+ +------------+ +------------+ +------------+
       |                                                       ^        ^
       |                                                       |        |
       |                                                       |        |
       |                                                    Client A Client B
       |
       |
       |
 +------------+ +------------+ +------------+ +------------+ +------------+
 |    swp0    | |    swp1    | |    swp2    | |    swp3    | |   wlan0    |
 +------------+ +------------+ +------------+ +------------+ +------------+
 +------------------------------------------------------------------------+
 |                                  br0                                   |
 +------------------------------------------------------------------------+
AP 2
- br0 of AP 1 will know that Clients A and B are reachable via wlan0
- the hardware fdb of a DSA switch driver today is not kept in sync with
the software entries on other bridge ports, so it will not know that
clients A and B are reachable via the CPU port UNLESS the hardware
switch itself performs SA learning from traffic injected from the CPU.
Nonetheless, a substantial number of switches don't.
- the hardware fdb of the DSA switch on AP 2 may autonomously learn that
Client A and B are reachable through swp0. Therefore, the software br0
of AP 2 also may or may not learn this. In the example we're
illustrating, some Ethernet traffic has been going on, and br0 from AP
2 has indeed learnt that it can reach Client B through swp0.
One of the wireless clients, say Client B, disconnects from AP 1 and
roams to AP 2. The topology now looks like this:
AP 1:
 +------------------------------------------------------------------------+
 |                                  br0                                   |
 +------------------------------------------------------------------------+
 +------------+ +------------+ +------------+ +------------+ +------------+
 |    swp0    | |    swp1    | |    swp2    | |    swp3    | |   wlan0    |
 +------------+ +------------+ +------------+ +------------+ +------------+
       |                                                       ^
       |                                                       |
       |                                                    Client A
       |
       |
       |                                                    Client B
       |                                                       |
       |                                                       v
 +------------+ +------------+ +------------+ +------------+ +------------+
 |    swp0    | |    swp1    | |    swp2    | |    swp3    | |   wlan0    |
 +------------+ +------------+ +------------+ +------------+ +------------+
 +------------------------------------------------------------------------+
 |                                  br0                                   |
 +------------------------------------------------------------------------+
AP 2
- br0 of AP 1 still knows that Client A is reachable via wlan0 (no change)
- br0 of AP 1 will (possibly) know that Client B has left wlan0. There
are cases where it might never find out though. Either way, DSA today
does not process that notification in any way.
- the hardware FDB of the DSA switch on AP 1 may learn autonomously that
Client B can be reached via swp0, if it receives any packet with
Client B's source MAC address over Ethernet.
- the hardware FDB of the DSA switch on AP 2 still thinks that Client B
can be reached via swp0. It does not know that it has roamed to wlan0,
because it doesn't perform SA learning from the CPU port.
Now Client A contacts Client B.
AP 1 routes the packet fine towards swp0 and delivers it on the Ethernet
segment.
AP 2 sees a frame on swp0 and its fdb says that the destination is swp0.
Hairpinning is disabled => drop.
This problem comes from the fact that these switches have a 'blind spot'
for addresses coming from software bridging. The generic solution is not
to assume that hardware learning can be enabled somehow, but to listen
to more bridge learning events. It turns out that the bridge driver does
learn in software from all inbound frames, in __br_handle_local_finish.
A proper SWITCHDEV_FDB_ADD_TO_DEVICE notification is emitted for the
addresses serviced by the bridge on 'foreign' interfaces. The software
bridge also does the right thing on migration, by notifying that the old
entry is deleted, so that does not need to be special-cased in DSA. When
it is deleted, we just need to delete our static FDB entry towards the
CPU too, and wait.
The problem is that DSA currently only cares about SWITCHDEV_FDB_ADD_TO_DEVICE
events received on its own interfaces, such as static FDB entries.
Luckily we can change that, and DSA can listen to all switchdev FDB
add/del events in the system and figure out if those events were emitted
by a bridge that spans at least one of DSA's own ports. In case that is
true, DSA will also offload that address towards its own CPU port, in
the eventuality that there might be bridge clients attached to the DSA
switch who want to talk to the station connected to the foreign
interface.
In terms of implementation, we need to keep the fdb_info->added_by_user
check for the case where the switchdev event was targeted directly at a
DSA switch port. But we don't need to look at that flag for snooped
events. So the check is currently done too late, and we need to move it
earlier.
This also simplifies the code a bit, since we avoid uselessly allocating
and freeing switchdev_work.
We could probably do some improvements in the future. For example,
multi-bridge support is rudimentary at the moment. If there are two
bridges spanning a DSA switch's ports, and both of them need to service
the same MAC address, then what will happen is that the migration of one
of those stations will trigger the deletion of the FDB entry from the
CPU port while it is still used by the other bridge. That could be improved
with reference counting but is left for another time.
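For illustration, one possible shape for such reference counting (all
names below are hypothetical, not part of this patch) is to keep one
host entry per {MAC, VID} on the CPU port and delete it only when the
last bridge stops using it:

    #include <linux/if_ether.h>
    #include <linux/list.h>
    #include <linux/refcount.h>

    /* Hypothetical sketch, not part of this patch: a host address
     * installed on the CPU port, shared by every bridge that
     * requested it.
     */
    struct dsa_host_addr {
            unsigned char addr[ETH_ALEN];
            u16 vid;
            refcount_t refcount;    /* one reference per interested bridge */
            struct list_head list;
    };

    /* Returns true when the caller should delete the CPU port FDB entry */
    static bool dsa_host_addr_put(struct dsa_host_addr *a)
    {
            return refcount_dec_and_test(&a->refcount);
    }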
This behavior needs to be enabled at driver level by setting
ds->assisted_learning_on_cpu_port = true. This is because we don't want
to inflict a potential performance penalty (accesses through
MDIO/I2C/SPI are expensive) on hardware that really doesn't need it
because address learning on the CPU port works there.
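As a minimal usage sketch (driver and function names are hypothetical,
not taken from this patch), a driver whose hardware does not learn SA
from CPU-injected traffic would opt in from its setup callback:

    #include <net/dsa.h>

    /* Hypothetical driver setup: opt in to assisted learning because
     * this hardware does not perform SA learning on the CPU port.
     */
    static int example_switch_setup(struct dsa_switch *ds)
    {
            ds->assisted_learning_on_cpu_port = true;
            return 0;
    }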
Reported-by: DENG Qingfang <dqfext@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
include/net/dsa.h | 5 +++++
net/dsa/slave.c | 66 +++++++++++++++++++++++++++++++++++++++++++++----------
2 files changed, 60 insertions(+), 11 deletions(-)
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -317,6 +317,11 @@ struct dsa_switch {
*/
bool untag_bridge_pvid;
+ /* Let DSA manage the FDB entries towards the CPU, based on the
+ * software bridge database.
+ */
+ bool assisted_learning_on_cpu_port;
+
/* In case vlan_filtering_is_global is set, the VLAN awareness state
* should be retrieved from here and not from the per-port settings.
*/
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -2149,6 +2149,28 @@ static void dsa_slave_switchdev_event_wo
dev_put(dp->slave);
}
+static int dsa_lower_dev_walk(struct net_device *lower_dev,
+ struct netdev_nested_priv *priv)
+{
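+	/* A non-zero return stops netdev_walk_all_lower_dev_rcu() early */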
+ if (dsa_slave_dev_check(lower_dev)) {
+ priv->data = (void *)netdev_priv(lower_dev);
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct dsa_slave_priv *dsa_slave_dev_lower_find(struct net_device *dev)
+{
+ struct netdev_nested_priv priv = {
+ .data = NULL,
+ };
+
+ netdev_walk_all_lower_dev_rcu(dev, dsa_lower_dev_walk, &priv);
+
+ return (struct dsa_slave_priv *)priv.data;
+}
+
/* Called under rcu_read_lock() */
static int dsa_slave_switchdev_event(struct notifier_block *unused,
unsigned long event, void *ptr)
@@ -2167,10 +2189,37 @@ static int dsa_slave_switchdev_event(str
return notifier_from_errno(err);
case SWITCHDEV_FDB_ADD_TO_DEVICE:
case SWITCHDEV_FDB_DEL_TO_DEVICE:
- if (!dsa_slave_dev_check(dev))
- return NOTIFY_DONE;
+ fdb_info = ptr;
+
+ if (dsa_slave_dev_check(dev)) {
+ if (!fdb_info->added_by_user)
+ return NOTIFY_OK;
+
+ dp = dsa_slave_to_port(dev);
+ } else {
+ /* Snoop addresses learnt on foreign interfaces
+ * bridged with us, for switches that don't
+ * automatically learn SA from CPU-injected traffic
+ */
+ struct net_device *br_dev;
+ struct dsa_slave_priv *p;
+
+ br_dev = netdev_master_upper_dev_get_rcu(dev);
+ if (!br_dev)
+ return NOTIFY_DONE;
+
+ if (!netif_is_bridge_master(br_dev))
+ return NOTIFY_DONE;
+
+ p = dsa_slave_dev_lower_find(br_dev);
+ if (!p)
+ return NOTIFY_DONE;
- dp = dsa_slave_to_port(dev);
+ dp = p->dp->cpu_dp;
+
+ if (!dp->ds->assisted_learning_on_cpu_port)
+ return NOTIFY_DONE;
+ }
if (!dp->ds->ops->port_fdb_add || !dp->ds->ops->port_fdb_del)
return NOTIFY_DONE;
@@ -2185,18 +2234,13 @@ static int dsa_slave_switchdev_event(str
switchdev_work->port = dp->index;
switchdev_work->event = event;
- fdb_info = ptr;
-
- if (!fdb_info->added_by_user) {
- kfree(switchdev_work);
- return NOTIFY_OK;
- }
-
ether_addr_copy(switchdev_work->addr,
fdb_info->addr);
switchdev_work->vid = fdb_info->vid;
- dev_hold(dev);
+ /* Hold a reference on the slave for dsa_fdb_offload_notify */
+ if (dsa_is_user_port(dp->ds, dp->index))
+ dev_hold(dev);
dsa_schedule_work(&switchdev_work->work);
break;
default:

@@ -1,84 +0,0 @@
From c3b8e07909dbe67b0d580416c1a5257643a73be7 Mon Sep 17 00:00:00 2001
From: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Date: Fri, 12 Mar 2021 00:07:03 -0800
Subject: [PATCH] net: dsa: mt7530: setup core clock even in TRGMII mode
A recent change to MIPS ralink reset logic made it so mt7530 actually
resets the switch on platforms such as mt7621 (where bit 2 is the reset
line for the switch). That exposed an issue where the switch would not
function properly in TRGMII mode after a reset.
Reconfigure core clock in TRGMII mode to fix the issue.
Tested on Ubiquiti ER-X (MT7621) with TRGMII mode enabled.
Fixes: 3f9ef7785a9c ("MIPS: ralink: manage low reset lines")
Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
drivers/net/dsa/mt7530.c | 52 +++++++++++++++++++---------------------
1 file changed, 25 insertions(+), 27 deletions(-)
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -436,34 +436,32 @@ mt7530_pad_clk_setup(struct dsa_switch *
TD_DM_DRVP(8) | TD_DM_DRVN(8));
/* Setup core clock for MT7530 */
- if (!trgint) {
- /* Disable MT7530 core clock */
- core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
-
- /* Disable PLL, since phy_device has not yet been created
- * provided for phy_[read,write]_mmd_indirect is called, we
- * provide our own core_write_mmd_indirect to complete this
- * function.
- */
- core_write_mmd_indirect(priv,
- CORE_GSWPLL_GRP1,
- MDIO_MMD_VEND2,
- 0);
-
- /* Set core clock into 500Mhz */
- core_write(priv, CORE_GSWPLL_GRP2,
- RG_GSWPLL_POSDIV_500M(1) |
- RG_GSWPLL_FBKDIV_500M(25));
-
- /* Enable PLL */
- core_write(priv, CORE_GSWPLL_GRP1,
- RG_GSWPLL_EN_PRE |
- RG_GSWPLL_POSDIV_200M(2) |
- RG_GSWPLL_FBKDIV_200M(32));
-
- /* Enable MT7530 core clock */
- core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
- }
+ /* Disable MT7530 core clock */
+ core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
+
+ /* Disable PLL, since phy_device has not yet been created
+ * provided for phy_[read,write]_mmd_indirect is called, we
+ * provide our own core_write_mmd_indirect to complete this
+ * function.
+ */
+ core_write_mmd_indirect(priv,
+ CORE_GSWPLL_GRP1,
+ MDIO_MMD_VEND2,
+ 0);
+
+ /* Set core clock into 500Mhz */
+ core_write(priv, CORE_GSWPLL_GRP2,
+ RG_GSWPLL_POSDIV_500M(1) |
+ RG_GSWPLL_FBKDIV_500M(25));
+
+ /* Enable PLL */
+ core_write(priv, CORE_GSWPLL_GRP1,
+ RG_GSWPLL_EN_PRE |
+ RG_GSWPLL_POSDIV_200M(2) |
+ RG_GSWPLL_FBKDIV_200M(32));
+
+ /* Enable MT7530 core clock */
+ core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
/* Setup the MT7530 TRGMII Tx Clock */
core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);

@@ -1,181 +0,0 @@
From 429a0edeefd88cbfca5c417dfb8561047bb50769 Mon Sep 17 00:00:00 2001
From: DENG Qingfang <dqfext@gmail.com>
Date: Mon, 25 Jan 2021 12:43:22 +0800
Subject: [PATCH] net: dsa: mt7530: MT7530 optional GPIO support
MT7530's LED controller can drive up to 15 LED/GPIOs.
Add support for GPIO control and allow users to use its GPIOs by
setting gpio-controller property in device tree.
Signed-off-by: DENG Qingfang <dqfext@gmail.com>
Reviewed-by: Linus Walleij <linus.walleij@linaro.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
drivers/net/dsa/mt7530.c | 110 +++++++++++++++++++++++++++++++++++++++
drivers/net/dsa/mt7530.h | 20 +++++++
2 files changed, 130 insertions(+)
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -18,6 +18,7 @@
#include <linux/regulator/consumer.h>
#include <linux/reset.h>
#include <linux/gpio/consumer.h>
+#include <linux/gpio/driver.h>
#include <net/dsa.h>
#include "mt7530.h"
@@ -1534,6 +1535,109 @@ mtk_get_tag_protocol(struct dsa_switch *
}
}
+static inline u32
+mt7530_gpio_to_bit(unsigned int offset)
+{
+ /* Map GPIO offset to register bit
+ * [ 2: 0] port 0 LED 0..2 as GPIO 0..2
+ * [ 6: 4] port 1 LED 0..2 as GPIO 3..5
+ * [10: 8] port 2 LED 0..2 as GPIO 6..8
+ * [14:12] port 3 LED 0..2 as GPIO 9..11
+ * [18:16] port 4 LED 0..2 as GPIO 12..14
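+	 * e.g. GPIO 7 (port 2, LED 1) maps to bit 7 + 7/3 = 9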
+ */
+ return BIT(offset + offset / 3);
+}
+
+static int
+mt7530_gpio_get(struct gpio_chip *gc, unsigned int offset)
+{
+ struct mt7530_priv *priv = gpiochip_get_data(gc);
+ u32 bit = mt7530_gpio_to_bit(offset);
+
+ return !!(mt7530_read(priv, MT7530_LED_GPIO_DATA) & bit);
+}
+
+static void
+mt7530_gpio_set(struct gpio_chip *gc, unsigned int offset, int value)
+{
+ struct mt7530_priv *priv = gpiochip_get_data(gc);
+ u32 bit = mt7530_gpio_to_bit(offset);
+
+ if (value)
+ mt7530_set(priv, MT7530_LED_GPIO_DATA, bit);
+ else
+ mt7530_clear(priv, MT7530_LED_GPIO_DATA, bit);
+}
+
+static int
+mt7530_gpio_get_direction(struct gpio_chip *gc, unsigned int offset)
+{
+ struct mt7530_priv *priv = gpiochip_get_data(gc);
+ u32 bit = mt7530_gpio_to_bit(offset);
+
+ return (mt7530_read(priv, MT7530_LED_GPIO_DIR) & bit) ?
+ GPIO_LINE_DIRECTION_OUT : GPIO_LINE_DIRECTION_IN;
+}
+
+static int
+mt7530_gpio_direction_input(struct gpio_chip *gc, unsigned int offset)
+{
+ struct mt7530_priv *priv = gpiochip_get_data(gc);
+ u32 bit = mt7530_gpio_to_bit(offset);
+
+ mt7530_clear(priv, MT7530_LED_GPIO_OE, bit);
+ mt7530_clear(priv, MT7530_LED_GPIO_DIR, bit);
+
+ return 0;
+}
+
+static int
+mt7530_gpio_direction_output(struct gpio_chip *gc, unsigned int offset, int value)
+{
+ struct mt7530_priv *priv = gpiochip_get_data(gc);
+ u32 bit = mt7530_gpio_to_bit(offset);
+
+ mt7530_set(priv, MT7530_LED_GPIO_DIR, bit);
+
+ if (value)
+ mt7530_set(priv, MT7530_LED_GPIO_DATA, bit);
+ else
+ mt7530_clear(priv, MT7530_LED_GPIO_DATA, bit);
+
+ mt7530_set(priv, MT7530_LED_GPIO_OE, bit);
+
+ return 0;
+}
+
+static int
+mt7530_setup_gpio(struct mt7530_priv *priv)
+{
+ struct device *dev = priv->dev;
+ struct gpio_chip *gc;
+
+ gc = devm_kzalloc(dev, sizeof(*gc), GFP_KERNEL);
+ if (!gc)
+ return -ENOMEM;
+
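+	/* Select GPIO mode for all LED pins, inputs with output drivers disabled */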
+ mt7530_write(priv, MT7530_LED_GPIO_OE, 0);
+ mt7530_write(priv, MT7530_LED_GPIO_DIR, 0);
+ mt7530_write(priv, MT7530_LED_IO_MODE, 0);
+
+ gc->label = "mt7530";
+ gc->parent = dev;
+ gc->owner = THIS_MODULE;
+ gc->get_direction = mt7530_gpio_get_direction;
+ gc->direction_input = mt7530_gpio_direction_input;
+ gc->direction_output = mt7530_gpio_direction_output;
+ gc->get = mt7530_gpio_get;
+ gc->set = mt7530_gpio_set;
+ gc->base = -1;
+ gc->ngpio = 15;
+ gc->can_sleep = true;
+
+ return devm_gpiochip_add_data(dev, gc, priv);
+}
+
static int
mt7530_setup(struct dsa_switch *ds)
{
@@ -1675,6 +1779,12 @@ mt7530_setup(struct dsa_switch *ds)
}
}
+ if (of_property_read_bool(priv->dev->of_node, "gpio-controller")) {
+ ret = mt7530_setup_gpio(priv);
+ if (ret)
+ return ret;
+ }
+
mt7530_setup_port5(ds, interface);
/* Flush the FDB table */
--- a/drivers/net/dsa/mt7530.h
+++ b/drivers/net/dsa/mt7530.h
@@ -529,6 +529,26 @@ enum mt7531_clk_skew {
#define MT7531_GPIO12_RG_RXD3_MASK GENMASK(19, 16)
#define MT7531_EXT_P_MDIO_12 (2 << 16)
+/* Registers for LED GPIO control (MT7530 only)
+ * All registers follow this pattern:
+ * [ 2: 0] port 0
+ * [ 6: 4] port 1
+ * [10: 8] port 2
+ * [14:12] port 3
+ * [18:16] port 4
+ */
+
+/* LED enable, 0: Disable, 1: Enable (Default) */
+#define MT7530_LED_EN 0x7d00
+/* LED mode, 0: GPIO mode, 1: PHY mode (Default) */
+#define MT7530_LED_IO_MODE 0x7d04
+/* GPIO direction, 0: Input, 1: Output */
+#define MT7530_LED_GPIO_DIR 0x7d10
+/* GPIO output enable, 0: Disable, 1: Enable */
+#define MT7530_LED_GPIO_OE 0x7d14
+/* GPIO value, 0: Low, 1: High */
+#define MT7530_LED_GPIO_DATA 0x7d18
+
#define MT7530_CREV 0x7ffc
#define CHIP_NAME_SHIFT 16
#define MT7530_ID 0x7530
