generic: copy backport, hack, pending patch and config from 5.15 to 6.1
Copy backport, hack and pending patches and config from 5.15 to 6.1.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
parent 8fb9bbcf65
commit fa79baf4a6
@@ -0,0 +1,73 @@
From 2fd7e7f9317d3048a14026816d081b08ba98ea8e Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 8 Mar 2022 22:56:13 +0100
Subject: [PATCH 1/3] Kbuild: use -Wdeclaration-after-statement

The kernel is moving from using `-std=gnu89` to `-std=gnu11`, permitting
the use of additional C11 features such as for-loop initial declarations.

One contentious aspect of C99 is that it permits mixed declarations and
code, and for now at least, it seems preferable to enforce that
declarations must come first.

These warnings were already enabled in the kernel itself, but not
for KBUILD_USERCFLAGS or the compat VDSO on arch/arm64, which uses
a separate set of CFLAGS.

This patch fixes an existing violation in modpost.c, which is not
reported because of the missing flag in KBUILD_USERCFLAGS:

| scripts/mod/modpost.c: In function ‘match’:
| scripts/mod/modpost.c:837:3: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement]
|   837 |   const char *endp = p + strlen(p) - 1;
|       |   ^~~~~
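As a quick illustration (an editorial sketch, not part of the patch), the
following stand-alone snippet reproduces the class of warning above; compiling
it with `gcc -Wdeclaration-after-statement -c` warns on the second declaration
because it follows a statement, the same pattern fixed in modpost.c below:

#include <string.h>

/* hypothetical example function, not from the kernel tree */
int ends_in_star(const char *p)
{
	size_t n = strlen(p);		/* declaration first: fine */

	if (n == 0)
		return 0;		/* a statement */

	const char *endp = p + n - 1;	/* declaration after a statement: warned */
	return *endp == '*';
}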

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
[arnd: don't add a duplicate flag to the default set, update changelog]
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com> # LLVM/Clang v13.0.0 (x86-64)
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 Makefile                          | 3 ++-
 arch/arm64/kernel/vdso32/Makefile | 1 +
 scripts/mod/modpost.c             | 4 +++-
 3 files changed, 6 insertions(+), 2 deletions(-)

--- a/Makefile
+++ b/Makefile
@@ -440,7 +440,8 @@ endif
 HOSTPKG_CONFIG	= pkg-config

 export KBUILD_USERCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
-	-O2 -fomit-frame-pointer -std=gnu89
+	-O2 -fomit-frame-pointer -std=gnu89 \
+	-Wdeclaration-after-statement
 export KBUILD_USERLDFLAGS :=

 KBUILD_HOSTCFLAGS   := $(KBUILD_USERCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS)
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -76,6 +76,7 @@ VDSO_CFLAGS += -Wall -Wundef -Wstrict-pr
 	       -fno-strict-aliasing -fno-common \
 	       -Werror-implicit-function-declaration \
 	       -Wno-format-security \
+	       -Wdeclaration-after-statement \
 	       -std=gnu89
 VDSO_CFLAGS += -O2
 # Some useful compiler-dependent flags from top-level Makefile
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -833,8 +833,10 @@ static int match(const char *sym, const
 {
 	const char *p;
 	while (*pat) {
+		const char *endp;
+
 		p = *pat++;
-		const char *endp = p + strlen(p) - 1;
+		endp = p + strlen(p) - 1;

 		/* "*foo*" */
 		if (*p == '*' && *endp == '*') {

@@ -0,0 +1,60 @@
From b810c8e719ea082e47c7a8f7cf878bc84fa2455d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 8 Mar 2022 22:56:14 +0100
Subject: [PATCH 2/3] Kbuild: move to -std=gnu11

During a patch discussion, Linus brought up the option of changing
the C standard version from gnu89 to gnu99, which allows using variable
declaration inside of a for() loop. While the C99, C11 and later standards
introduce many other features, most of these are already available in
gnu89 as GNU extensions as well.

An earlier attempt to do this when gcc-5 started defaulting to
-std=gnu11 failed because at the time that caused warnings about
designated initializers with older compilers. Now that gcc-5.1 is
the minimum compiler version used for building kernels, that is no
longer a concern. Similarly, the behavior of 'inline' functions changes
between gnu89 using gnu_inline behavior and gnu11 using standard c99+
behavior, but this was taken care of by defining 'inline' to include
__attribute__((gnu_inline)) in order to allow building with clang a
while ago.

Nathan Chancellor reported a new -Wdeclaration-after-statement
warning that appears in a system header on arm; this still needs a
workaround.

The differences between gnu99, gnu11, gnu1x and gnu17 are fairly
minimal and mainly impact warnings at the -Wpedantic level that the
kernel never enables. Between these, gnu11 is the newest version
that is supported by all supported compiler versions, though it is
only the default on gcc-5, while all other supported versions of
gcc or clang default to gnu1x/gnu17.
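To make the difference concrete (an editorial sketch, not part of the patch):
under -std=gnu89 the loop below is rejected with "'for' loop initial
declarations are only allowed in C99 or C11 mode", while under -std=gnu11 it
compiles as-is:

/* hypothetical example; compile with: gcc -std=gnu11 -c example.c */
int sum(const int *v, int n)
{
	int total = 0;

	for (int i = 0; i < n; i++)	/* C99/C11 for-loop initial declaration */
		total += v[i];

	return total;
}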

Link: https://lore.kernel.org/lkml/CAHk-=wiyCH7xeHcmiFJ-YgXUy2Jaj7pnkdKpcovt8fYbVFW3TA@mail.gmail.com/
Link: https://github.com/ClangBuiltLinux/linux/issues/1603
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Marco Elver <elver@google.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: David Sterba <dsterba@suse.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Reviewed-by: Alex Shi <alexs@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/Makefile
+++ b/Makefile
@@ -524,7 +524,7 @@ KBUILD_CFLAGS   := -Wall -Wundef -Werror
 		   -fno-strict-aliasing -fno-common -fshort-wchar -fno-PIE \
 		   -Werror=implicit-function-declaration -Werror=implicit-int \
 		   -Werror=return-type -Wno-format-security \
-		   -std=gnu89
+		   -std=gnu11
 KBUILD_CPPFLAGS := -D__KERNEL__
 KBUILD_AFLAGS_KERNEL :=
 KBUILD_CFLAGS_KERNEL :=

@@ -0,0 +1,43 @@
From 40337d6f3d677aee7ad3052ae662d3f53dd4d5cb Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 8 Mar 2022 22:56:15 +0100
Subject: [PATCH 3/3] Kbuild: use -std=gnu11 for KBUILD_USERCFLAGS

As we change the C language standard for the kernel from gnu89 to
gnu11, it makes sense to also update the version for user space
compilation.

Some users have older native compilers than what they use for
kernel builds, so I considered using gnu99 as the default version
for wider compatibility with gcc-4.6 and earlier.

However, testing with older compilers showed that we already require
HOSTCC version 5.1 as well because a lot of host tools include
linux/compiler.h that uses __has_attribute():

  CC      tools/objtool/exec-cmd.o
In file included from tools/include/linux/compiler_types.h:36:0,
                 from tools/include/linux/compiler.h:5,
                 from exec-cmd.c:2:
tools/include/linux/compiler-gcc.h:19:5: error: "__has_attribute" is not defined [-Werror=undef]
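For context (an editorial sketch, not the actual header): __has_attribute is a
preprocessor built-in that gcc only gained in the 5.x series, which is why an
unguarded use of it pins the minimum HOSTCC at 5.1. Headers that must also
build on older compilers typically define a fallback first, roughly:

/* fallback so the check is always defined; gcc < 5 lacks the built-in */
#ifndef __has_attribute
# define __has_attribute(x) 0
#endif

#if __has_attribute(__fallthrough__)
# define fallthrough __attribute__((__fallthrough__))
#else
# define fallthrough do {} while (0)
#endif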

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/Makefile
+++ b/Makefile
@@ -440,7 +440,7 @@ endif
 HOSTPKG_CONFIG	= pkg-config

 export KBUILD_USERCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
-	-O2 -fomit-frame-pointer -std=gnu89 \
+	-O2 -fomit-frame-pointer -std=gnu11 \
 	-Wdeclaration-after-statement
 export KBUILD_USERLDFLAGS :=

@@ -0,0 +1,425 @@
From a4103262b01a1b8704b37c01c7c813df91b7b119 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 01:59:58 -0600
Subject: [PATCH 01/29] mm: x86, arm64: add arch_has_hw_pte_young()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Multi-Gen LRU Framework", v14.

What's new
==========
1. OpenWrt, in addition to Android, Arch Linux Zen, Armbian, ChromeOS,
   Liquorix, post-factum and XanMod, is now shipping MGLRU on 5.15.
2. Fixed long-tailed direct reclaim latency seen on high-memory (TBs)
   machines. The old direct reclaim backoff, which tries to enforce a
   minimum fairness among all eligible memcgs, over-swapped by about
   (total_mem>>DEF_PRIORITY)-nr_to_reclaim. The new backoff, which
   pulls the plug on swapping once the target is met, trades some
   fairness for curtailed latency:
   https://lore.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com/
3. Fixed minor build warnings and conflicts. More comments and nits.

TLDR
====
The current page reclaim is too expensive in terms of CPU usage and it
often makes poor choices about what to evict. This patchset offers an
alternative solution that is performant, versatile and
straightforward.

Patchset overview
=================
The design and implementation overview is in patch 14:
https://lore.kernel.org/r/20220918080010.2920238-15-yuzhao@google.com/

01. mm: x86, arm64: add arch_has_hw_pte_young()
02. mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
Take advantage of hardware features when trying to clear the accessed
bit in many PTEs.

03. mm/vmscan.c: refactor shrink_node()
04. Revert "include/linux/mm_inline.h: fold __update_lru_size() into
    its sole caller"
Minor refactors to improve readability for the following patches.

05. mm: multi-gen LRU: groundwork
Adds the basic data structure and the functions that insert pages to
and remove pages from the multi-gen LRU (MGLRU) lists.

06. mm: multi-gen LRU: minimal implementation
A minimal implementation without optimizations.

07. mm: multi-gen LRU: exploit locality in rmap
Exploits spatial locality to improve efficiency when using the rmap.

08. mm: multi-gen LRU: support page table walks
Further exploits spatial locality by optionally scanning page tables.

09. mm: multi-gen LRU: optimize multiple memcgs
Optimizes the overall performance for multiple memcgs running mixed
types of workloads.

10. mm: multi-gen LRU: kill switch
Adds a kill switch to enable or disable MGLRU at runtime.

11. mm: multi-gen LRU: thrashing prevention
12. mm: multi-gen LRU: debugfs interface
Provide userspace with features like thrashing prevention, working set
estimation and proactive reclaim.

13. mm: multi-gen LRU: admin guide
14. mm: multi-gen LRU: design doc
Add an admin guide and a design doc.

Benchmark results
=================
Independent lab results
-----------------------
Based on the popularity of searches [01] and the memory usage in
Google's public cloud, the most popular open-source memory-hungry
applications, in alphabetical order, are:
      Apache Cassandra      Memcached
      Apache Hadoop         MongoDB
      Apache Spark          PostgreSQL
      MariaDB (MySQL)       Redis

An independent lab evaluated MGLRU with the most widely used benchmark
suites for the above applications. They posted 960 data points along
with kernel metrics and perf profiles collected over more than 500
hours of total benchmark time. Their final reports show that, with 95%
confidence intervals (CIs), the above applications all performed
significantly better for at least part of their benchmark matrices.

On 5.14:
1. Apache Spark [02] took 95% CIs [9.28, 11.19]% and [12.20, 14.93]%
   less wall time to sort three billion random integers, respectively,
   under the medium- and the high-concurrency conditions, when
   overcommitting memory. There were no statistically significant
   changes in wall time for the rest of the benchmark matrix.
2. MariaDB [03] achieved 95% CIs [5.24, 10.71]% and [20.22, 25.97]%
   more transactions per minute (TPM), respectively, under the medium-
   and the high-concurrency conditions, when overcommitting memory.
   There were no statistically significant changes in TPM for the rest
   of the benchmark matrix.
3. Memcached [04] achieved 95% CIs [23.54, 32.25]%, [20.76, 41.61]%
   and [21.59, 30.02]% more operations per second (OPS), respectively,
   for sequential access, random access and Gaussian (distribution)
   access, when THP=always; 95% CIs [13.85, 15.97]% and
   [23.94, 29.92]% more OPS, respectively, for random access and
   Gaussian access, when THP=never. There were no statistically
   significant changes in OPS for the rest of the benchmark matrix.
4. MongoDB [05] achieved 95% CIs [2.23, 3.44]%, [6.97, 9.73]% and
   [2.16, 3.55]% more operations per second (OPS), respectively, for
   exponential (distribution) access, random access and Zipfian
   (distribution) access, when underutilizing memory; 95% CIs
   [8.83, 10.03]%, [21.12, 23.14]% and [5.53, 6.46]% more OPS,
   respectively, for exponential access, random access and Zipfian
   access, when overcommitting memory.

On 5.15:
5. Apache Cassandra [06] achieved 95% CIs [1.06, 4.10]%, [1.94, 5.43]%
   and [4.11, 7.50]% more operations per second (OPS), respectively,
   for exponential (distribution) access, random access and Zipfian
   (distribution) access, when swap was off; 95% CIs [0.50, 2.60]%,
   [6.51, 8.77]% and [3.29, 6.75]% more OPS, respectively, for
   exponential access, random access and Zipfian access, when swap was
   on.
6. Apache Hadoop [07] took 95% CIs [5.31, 9.69]% and [2.02, 7.86]%
   less average wall time to finish twelve parallel TeraSort jobs,
   respectively, under the medium- and the high-concurrency
   conditions, when swap was on. There were no statistically
   significant changes in average wall time for the rest of the
   benchmark matrix.
7. PostgreSQL [08] achieved 95% CI [1.75, 6.42]% more transactions per
   minute (TPM) under the high-concurrency condition, when swap was
   off; 95% CIs [12.82, 18.69]% and [22.70, 46.86]% more TPM,
   respectively, under the medium- and the high-concurrency
   conditions, when swap was on. There were no statistically
   significant changes in TPM for the rest of the benchmark matrix.
8. Redis [09] achieved 95% CIs [0.58, 5.94]%, [6.55, 14.58]% and
   [11.47, 19.36]% more total operations per second (OPS),
   respectively, for sequential access, random access and Gaussian
   (distribution) access, when THP=always; 95% CIs [1.27, 3.54]%,
   [10.11, 14.81]% and [8.75, 13.64]% more total OPS, respectively,
   for sequential access, random access and Gaussian access, when
   THP=never.

Our lab results
---------------
To supplement the above results, we ran the following benchmark suites
on 5.16-rc7 and found no regressions [10].
      fs_fio_bench_hdd_mq      pft
      fs_lmbench               pgsql-hammerdb
      fs_parallelio            redis
      fs_postmark              stream
      hackbench                sysbenchthread
      kernbench                tpcc_spark
      memcached                unixbench
      multichase               vm-scalability
      mutilate                 will-it-scale
      nginx

[01] https://trends.google.com
[02] https://lore.kernel.org/r/20211102002002.92051-1-bot@edi.works/
[03] https://lore.kernel.org/r/20211009054315.47073-1-bot@edi.works/
[04] https://lore.kernel.org/r/20211021194103.65648-1-bot@edi.works/
[05] https://lore.kernel.org/r/20211109021346.50266-1-bot@edi.works/
[06] https://lore.kernel.org/r/20211202062806.80365-1-bot@edi.works/
[07] https://lore.kernel.org/r/20211209072416.33606-1-bot@edi.works/
[08] https://lore.kernel.org/r/20211218071041.24077-1-bot@edi.works/
[09] https://lore.kernel.org/r/20211122053248.57311-1-bot@edi.works/
[10] https://lore.kernel.org/r/20220104202247.2903702-1-yuzhao@google.com/

Real-world applications
=======================
Third-party testimonials
------------------------
Konstantin reported [11]:
   I have Archlinux with 8G RAM + zswap + swap. While developing, I
   have lots of apps opened such as multiple LSP-servers for different
   langs, chats, two browsers, etc... Usually, my system gets quickly
   to a point of SWAP-storms, where I have to kill LSP-servers,
   restart browsers to free memory, etc, otherwise the system lags
   heavily and is barely usable.

   1.5 day ago I migrated from 5.11.15 kernel to 5.12 + the LRU
   patchset, and I started up by opening lots of apps to create memory
   pressure, and worked for a day like this. Till now I had not a
   single SWAP-storm, and mind you I got 3.4G in SWAP. I was never
   getting to the point of 3G in SWAP before without a single
   SWAP-storm.

Vaibhav from IBM reported [12]:
   In a synthetic MongoDB Benchmark, seeing an average of ~19%
   throughput improvement on POWER10 (Radix MMU + 64K Page Size) with
   MGLRU patches on top of 5.16 kernel for MongoDB + YCSB across
   three different request distributions, namely, Exponential, Uniform
   and Zipfian.

Shuang from U of Rochester reported [13]:
   With the MGLRU, fio achieved 95% CIs [38.95, 40.26]%, [4.12, 6.64]%
   and [9.26, 10.36]% higher throughput, respectively, for random
   access, Zipfian (distribution) access and Gaussian (distribution)
   access, when the average number of jobs per CPU is 1; 95% CIs
   [42.32, 49.15]%, [9.44, 9.89]% and [20.99, 22.86]% higher
   throughput, respectively, for random access, Zipfian access and
   Gaussian access, when the average number of jobs per CPU is 2.

Daniel from Michigan Tech reported [14]:
   With Memcached allocating ~100GB of byte-addressable Optane,
   performance improvement in terms of throughput (measured as queries
   per second) was about 10% for a series of workloads.

Large-scale deployments
-----------------------
We've rolled out MGLRU to tens of millions of ChromeOS users and
about a million Android users. Google's fleetwide profiling [15] shows
an overall 40% decrease in kswapd CPU usage, in addition to
improvements in other UX metrics, e.g., an 85% decrease in the number
of low-memory kills at the 75th percentile and an 18% decrease in
app launch time at the 50th percentile.

The downstream kernels that have been using MGLRU include:
1. Android [16]
2. Arch Linux Zen [17]
3. Armbian [18]
4. ChromeOS [19]
5. Liquorix [20]
6. OpenWrt [21]
7. post-factum [22]
8. XanMod [23]

[11] https://lore.kernel.org/r/140226722f2032c86301fbd326d91baefe3d7d23.camel@yandex.ru/
[12] https://lore.kernel.org/r/87czj3mux0.fsf@vajain21.in.ibm.com/
[13] https://lore.kernel.org/r/20220105024423.26409-1-szhai2@cs.rochester.edu/
[14] https://lore.kernel.org/r/CA+4-3vksGvKd18FgRinxhqHetBS1hQekJE2gwco8Ja-bJWKtFw@mail.gmail.com/
[15] https://dl.acm.org/doi/10.1145/2749469.2750392
[16] https://android.com
[17] https://archlinux.org
[18] https://armbian.com
[19] https://chromium.org
[20] https://liquorix.net
[21] https://openwrt.org
[22] https://codeberg.org/pf-kernel
[23] https://xanmod.org

Summary
=======
The facts are:
1. The independent lab results and the real-world applications
   indicate substantial improvements; there are no known regressions.
2. Thrashing prevention, working set estimation and proactive reclaim
   work out of the box; there are no equivalent solutions.
3. There is a lot of new code; no smaller changes have demonstrated
   similar effects.

Our options, accordingly, are:
1. Given the amount of evidence, the reported improvements will likely
   materialize for a wide range of workloads.
2. Gauging the interest from the past discussions, the new features
   will likely be put to use for both personal computers and data
   centers.
3. Based on Google's track record, the new code will likely be well
   maintained in the long term. It'd be more difficult if not
   impossible to achieve similar effects with other approaches.

This patch (of 14):

Some architectures automatically set the accessed bit in PTEs, e.g., x86
and arm64 v8.2. On architectures that do not have this capability,
clearing the accessed bit in a PTE usually triggers a page fault following
the TLB miss of this PTE (to emulate the accessed bit).

Being aware of this capability can help make better decisions, e.g.,
whether to spread the work out over a period of time to reduce bursty page
faults when trying to clear the accessed bit in many PTEs.

Note that theoretically this capability can be unreliable, e.g.,
hotplugged CPUs might be different from builtin ones. Therefore it should
not be used in architecture-independent code that involves correctness,
e.g., to determine whether TLB flushes are required (in combination with
the accessed bit).
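To make the intended use concrete (an editorial, stand-alone sketch; the stub
and the batch numbers are invented, not from this series), a caller would treat
the capability purely as a performance hint:

#include <stdbool.h>
#include <stdio.h>

/* stand-in for the per-arch helper this patch introduces */
static bool arch_has_hw_pte_young(void) { return true; }

/* hypothetical policy: clear the accessed bit in large batches when
 * hardware sets it for free, in small ones when every cleared bit may
 * later cost a minor fault to re-set */
static unsigned int aging_batch(void)
{
	return arch_has_hw_pte_young() ? 4096 : 64;
}

int main(void)
{
	printf("scan batch: %u PTEs\n", aging_batch());
	return 0;
}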

Link: https://lkml.kernel.org/r/20220918080010.2920238-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/20220918080010.2920238-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Acked-by: Will Deacon <will@kernel.org>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/arm64/include/asm/pgtable.h | 14 ++------------
 arch/x86/include/asm/pgtable.h   |  6 +++---
 include/linux/pgtable.h          | 13 +++++++++++++
 mm/memory.c                      | 14 +-------------
 4 files changed, 19 insertions(+), 28 deletions(-)

--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -999,23 +999,13 @@ static inline void update_mmu_cache(stru
  * page after fork() + CoW for pfn mappings. We don't always have a
  * hardware-managed access flag on arm64.
  */
-static inline bool arch_faults_on_old_pte(void)
-{
-	WARN_ON(preemptible());
-
-	return !cpu_has_hw_af();
-}
-#define arch_faults_on_old_pte		arch_faults_on_old_pte
+#define arch_has_hw_pte_young		cpu_has_hw_af

 /*
  * Experimentally, it's cheap to set the access flag in hardware and we
  * benefit from prefaulting mappings as 'old' to start with.
  */
-static inline bool arch_wants_old_prefaulted_pte(void)
-{
-	return !arch_faults_on_old_pte();
-}
-#define arch_wants_old_prefaulted_pte	arch_wants_old_prefaulted_pte
+#define arch_wants_old_prefaulted_pte	cpu_has_hw_af

 #endif /* !__ASSEMBLY__ */

--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c
 	return boot_cpu_has_bug(X86_BUG_L1TF);
 }

-#define arch_faults_on_old_pte arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
+#define arch_has_hw_pte_young arch_has_hw_pte_young
+static inline bool arch_has_hw_pte_young(void)
 {
-	return false;
+	return true;
 }

 #endif	/* __ASSEMBLY__ */
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif

+#ifndef arch_has_hw_pte_young
+/*
+ * Return whether the accessed bit is supported on the local CPU.
+ *
+ * This stub assumes accessing through an old PTE triggers a page fault.
+ * Architectures that automatically set the access bit should overwrite it.
+ */
+static inline bool arch_has_hw_pte_young(void)
+{
+	return false;
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 				       unsigned long address,
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly =
 					2;
 #endif

-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
-	/*
-	 * Those arches which don't have hw access flag feature need to
-	 * implement their own helper. By default, "true" means pagefault
-	 * will be hit on old pte.
-	 */
-	return true;
-}
-#endif
-
 #ifndef arch_wants_old_prefaulted_pte
 static inline bool arch_wants_old_prefaulted_pte(void)
 {
@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct
 	 * On architectures with software "accessed" bits, we would
 	 * take a double page fault, so mark it accessed here.
 	 */
-	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+	if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
 		pte_t entry;

 		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);

@@ -0,0 +1,153 @@
From 493de1c4b0f2cd909169401da8c445f6c8a7e29d Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 01:59:59 -0600
Subject: [PATCH 02/29] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some architectures support the accessed bit in non-leaf PMD entries, e.g.,
x86 sets the accessed bit in a non-leaf PMD entry when using it as part of
linear address translation [1]. Page table walkers that clear the
accessed bit may use this capability to reduce their search space.
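The pruning idea in a stand-alone editorial sketch (the types and sizes are
invented stand-ins for real page-table structures, not code from this series):
if a non-leaf entry's accessed bit is clear, no PTE beneath it was used, so the
walker skips the whole range:

#include <stdbool.h>
#include <stddef.h>

#define PTRS_PER_PMD 512

struct pmd { bool young; };	/* invented stand-ins for real entries */
struct pte { bool young; };

/* visit only PTE ranges whose parent PMD was marked accessed;
 * clear the PMD bit so the next pass starts from fresh state */
static size_t count_young(struct pmd *pmds,
			  struct pte (*ptes)[PTRS_PER_PMD], size_t nr_pmd)
{
	size_t found = 0;

	for (size_t i = 0; i < nr_pmd; i++) {
		if (!pmds[i].young)
			continue;	/* whole range unused: skipped */
		pmds[i].young = false;
		for (size_t j = 0; j < PTRS_PER_PMD; j++)
			found += ptes[i][j].young;
	}
	return found;
}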

Note that:
1. Although an inline function is preferable, this capability is added
   as a configuration option for consistency with the existing macros.
2. Due to the little interest in other varieties, this capability was
   only tested on Intel and AMD CPUs.

Thanks to the following developers for their efforts [2][3].
  Randy Dunlap <rdunlap@infradead.org>
  Stephen Rothwell <sfr@canb.auug.org.au>

[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
     Volume 3 (June 2021), section 4.8
[2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/
[3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/

Link: https://lkml.kernel.org/r/20220918080010.2920238-3-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 arch/Kconfig                   | 8 ++++++++
 arch/x86/Kconfig               | 1 +
 arch/x86/include/asm/pgtable.h | 3 ++-
 arch/x86/mm/pgtable.c          | 5 ++++-
 include/linux/pgtable.h        | 4 ++--
 5 files changed, 17 insertions(+), 4 deletions(-)

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1295,6 +1295,14 @@ config ARCH_HAS_ELFCORE_COMPAT
 config ARCH_HAS_PARANOID_L1D_FLUSH
 	bool

+config ARCH_HAS_NONLEAF_PMD_YOUNG
+	bool
+	help
+	  Architectures that select this option are capable of setting the
+	  accessed bit in non-leaf PMD entries when using them as part of linear
+	  address translations. Page table walkers that clear the accessed bit
+	  may use this capability to reduce their search space.
+
 source "kernel/gcov/Kconfig"

 source "scripts/gcc-plugins/Kconfig"
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -84,6 +84,7 @@ config X86
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PTE_DEVMAP		if X86_64
 	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_NONLEAF_PMD_YOUNG	if PGTABLE_LEVELS > 2
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_COPY_MC			if X86_64
 	select ARCH_HAS_SET_MEMORY
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad

 static inline int pmd_bad(pmd_t pmd)
 {
-	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
+	       (_KERNPG_TABLE & ~_PAGE_ACCESSED);
 }

 static inline unsigned long pages_to_mb(unsigned long npg)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_
 	return ret;
 }

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
 int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 			      unsigned long addr, pmd_t *pmdp)
 {
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_

 	return ret;
 }
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int pudp_test_and_clear_young(struct vm_area_struct *vma,
 			      unsigned long addr, pud_t *pudp)
 {
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo
 #endif

 #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
 static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
 					    unsigned long address,
 					    pmd_t *pmdp)
@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo
 	BUILD_BUG();
 	return 0;
 }
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
 #endif

 #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH

@@ -0,0 +1,275 @@
From 9e17efd11450d3d2069adaa3c58db9ac8ebd1c66 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:00 -0600
Subject: [PATCH 03/29] mm/vmscan.c: refactor shrink_node()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch refactors shrink_node() to improve readability for the upcoming
changes to mm/vmscan.c.

Link: https://lkml.kernel.org/r/20220918080010.2920238-4-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmscan.c | 198 +++++++++++++++++++++++++++-------------------------
 1 file changed, 104 insertions(+), 94 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2497,6 +2497,109 @@ enum scan_balance {
 	SCAN_FILE,
 };

+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+	unsigned long file;
+	struct lruvec *target_lruvec;
+
+	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+	/*
+	 * Flush the memory cgroup stats, so that we read accurate per-memcg
+	 * lruvec stats for heuristics.
+	 */
+	mem_cgroup_flush_stats();
+
+	/*
+	 * Determine the scan balance between anon and file LRUs.
+	 */
+	spin_lock_irq(&target_lruvec->lru_lock);
+	sc->anon_cost = target_lruvec->anon_cost;
+	sc->file_cost = target_lruvec->file_cost;
+	spin_unlock_irq(&target_lruvec->lru_lock);
+
+	/*
+	 * Target desirable inactive:active list ratios for the anon
+	 * and file LRU lists.
+	 */
+	if (!sc->force_deactivate) {
+		unsigned long refaults;
+
+		refaults = lruvec_page_state(target_lruvec,
+					     WORKINGSET_ACTIVATE_ANON);
+		if (refaults != target_lruvec->refaults[0] ||
+		    inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+			sc->may_deactivate |= DEACTIVATE_ANON;
+		else
+			sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+		/*
+		 * When refaults are being observed, it means a new
+		 * workingset is being established. Deactivate to get
+		 * rid of any stale active pages quickly.
+		 */
+		refaults = lruvec_page_state(target_lruvec,
+					     WORKINGSET_ACTIVATE_FILE);
+		if (refaults != target_lruvec->refaults[1] ||
+		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+			sc->may_deactivate |= DEACTIVATE_FILE;
+		else
+			sc->may_deactivate &= ~DEACTIVATE_FILE;
+	} else
+		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+	/*
+	 * If we have plenty of inactive file pages that aren't
+	 * thrashing, try to reclaim those first before touching
+	 * anonymous pages.
+	 */
+	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+		sc->cache_trim_mode = 1;
+	else
+		sc->cache_trim_mode = 0;
+
+	/*
+	 * Prevent the reclaimer from falling into the cache trap: as
+	 * cache pages start out inactive, every cache fault will tip
+	 * the scan balance towards the file LRU. And as the file LRU
+	 * shrinks, so does the window for rotation from references.
+	 * This means we have a runaway feedback loop where a tiny
+	 * thrashing file LRU becomes infinitely more attractive than
+	 * anon pages. Try to detect this based on file LRU size.
+	 */
+	if (!cgroup_reclaim(sc)) {
+		unsigned long total_high_wmark = 0;
+		unsigned long free, anon;
+		int z;
+
+		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+		       node_page_state(pgdat, NR_INACTIVE_FILE);
+
+		for (z = 0; z < MAX_NR_ZONES; z++) {
+			struct zone *zone = &pgdat->node_zones[z];
+
+			if (!managed_zone(zone))
+				continue;
+
+			total_high_wmark += high_wmark_pages(zone);
+		}
+
+		/*
+		 * Consider anon: if that's low too, this isn't a
+		 * runaway file reclaim problem, but rather just
+		 * extreme pressure. Reclaim as per usual then.
+		 */
+		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+		sc->file_is_tiny =
+			file + free <= total_high_wmark &&
+			!(sc->may_deactivate & DEACTIVATE_ANON) &&
+			anon >> sc->priority;
+	}
+}
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned. The relative value of each set of LRU lists is determined
@@ -2965,109 +3068,16 @@ static void shrink_node(pg_data_t *pgdat
 	unsigned long nr_reclaimed, nr_scanned;
 	struct lruvec *target_lruvec;
 	bool reclaimable = false;
-	unsigned long file;

 	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

 again:
-	/*
-	 * Flush the memory cgroup stats, so that we read accurate per-memcg
-	 * lruvec stats for heuristics.
-	 */
-	mem_cgroup_flush_stats();
-
 	memset(&sc->nr, 0, sizeof(sc->nr));

 	nr_reclaimed = sc->nr_reclaimed;
 	nr_scanned = sc->nr_scanned;

-	/*
-	 * Determine the scan balance between anon and file LRUs.
-	 */
-	spin_lock_irq(&target_lruvec->lru_lock);
-	sc->anon_cost = target_lruvec->anon_cost;
-	sc->file_cost = target_lruvec->file_cost;
-	spin_unlock_irq(&target_lruvec->lru_lock);
-
-	/*
-	 * Target desirable inactive:active list ratios for the anon
-	 * and file LRU lists.
-	 */
-	if (!sc->force_deactivate) {
-		unsigned long refaults;
-
-		refaults = lruvec_page_state(target_lruvec,
-					     WORKINGSET_ACTIVATE_ANON);
-		if (refaults != target_lruvec->refaults[0] ||
-		    inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
-			sc->may_deactivate |= DEACTIVATE_ANON;
-		else
-			sc->may_deactivate &= ~DEACTIVATE_ANON;
-
-		/*
-		 * When refaults are being observed, it means a new
-		 * workingset is being established. Deactivate to get
-		 * rid of any stale active pages quickly.
-		 */
-		refaults = lruvec_page_state(target_lruvec,
-					     WORKINGSET_ACTIVATE_FILE);
-		if (refaults != target_lruvec->refaults[1] ||
-		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
-			sc->may_deactivate |= DEACTIVATE_FILE;
-		else
-			sc->may_deactivate &= ~DEACTIVATE_FILE;
-	} else
-		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
-	/*
-	 * If we have plenty of inactive file pages that aren't
-	 * thrashing, try to reclaim those first before touching
-	 * anonymous pages.
-	 */
-	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
-	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
-		sc->cache_trim_mode = 1;
-	else
-		sc->cache_trim_mode = 0;
-
-	/*
-	 * Prevent the reclaimer from falling into the cache trap: as
-	 * cache pages start out inactive, every cache fault will tip
-	 * the scan balance towards the file LRU. And as the file LRU
-	 * shrinks, so does the window for rotation from references.
-	 * This means we have a runaway feedback loop where a tiny
-	 * thrashing file LRU becomes infinitely more attractive than
-	 * anon pages. Try to detect this based on file LRU size.
-	 */
-	if (!cgroup_reclaim(sc)) {
-		unsigned long total_high_wmark = 0;
-		unsigned long free, anon;
-		int z;
-
-		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
-		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
-		       node_page_state(pgdat, NR_INACTIVE_FILE);
-
-		for (z = 0; z < MAX_NR_ZONES; z++) {
-			struct zone *zone = &pgdat->node_zones[z];
-			if (!managed_zone(zone))
-				continue;
-
-			total_high_wmark += high_wmark_pages(zone);
-		}
-
-		/*
-		 * Consider anon: if that's low too, this isn't a
-		 * runaway file reclaim problem, but rather just
-		 * extreme pressure. Reclaim as per usual then.
-		 */
-		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
-		sc->file_is_tiny =
-			file + free <= total_high_wmark &&
-			!(sc->may_deactivate & DEACTIVATE_ANON) &&
-			anon >> sc->priority;
-	}
+	prepare_scan_count(pgdat, sc);

 	shrink_node_memcgs(pgdat, sc);

@@ -0,0 +1,82 @@
From 03705be42114db7cc5bd6eb7bf7e8703c94d4880 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:01 -0600
Subject: [PATCH 04/29] Revert "include/linux/mm_inline.h: fold
 __update_lru_size() into its sole caller"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch undoes the following refactor: commit 289ccba18af4
("include/linux/mm_inline.h: fold __update_lru_size() into its sole
caller")

The upcoming changes to include/linux/mm_inline.h will reuse
__update_lru_size().

Link: https://lkml.kernel.org/r/20220918080010.2920238-5-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/mm_inline.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -24,7 +24,7 @@ static inline int page_is_file_lru(struc
 	return !PageSwapBacked(page);
 }

-static __always_inline void update_lru_size(struct lruvec *lruvec,
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
 				enum lru_list lru, enum zone_type zid,
 				int nr_pages)
 {
@@ -33,6 +33,13 @@ static __always_inline void update_lru_s
 	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
 	__mod_zone_page_state(&pgdat->node_zones[zid],
 				NR_ZONE_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+				enum lru_list lru, enum zone_type zid,
+				long nr_pages)
+{
+	__update_lru_size(lruvec, lru, zid, nr_pages);
 #ifdef CONFIG_MEMCG
 	mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
 #endif

@@ -0,0 +1,807 @@
From a9b328add8422921a0dbbef162730800e16e8cfd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:02 -0600
Subject: [PATCH 05/29] mm: multi-gen LRU: groundwork
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Evictable pages are divided into multiple generations for each lruvec.
The youngest generation number is stored in lrugen->max_seq for both
anon and file types as they are aged on an equal footing. The oldest
generation numbers are stored in lrugen->min_seq[] separately for anon
and file types as clean file pages can be evicted regardless of swap
constraints. These three variables are monotonically increasing.

Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
in order to fit into the gen counter in page->flags. Each truncated
generation number is an index to lrugen->lists[]. The sliding window
technique is used to track at least MIN_NR_GENS and at most
MAX_NR_GENS generations. The gen counter stores a value within [1,
MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
stores 0.

There are two conceptually independent procedures: "the aging", which
produces young generations, and "the eviction", which consumes old
generations. They form a closed-loop system, i.e., "the page reclaim".
Both procedures can be invoked from userspace for the purposes of working
set estimation and proactive reclaim. These techniques are commonly used
to optimize job scheduling (bin packing) in data centers [1][2].

To avoid confusion, the terms "hot" and "cold" will be applied to the
multi-gen LRU, as a new convention; the terms "active" and "inactive" will
be applied to the active/inactive LRU, as usual.

The protection of hot pages and the selection of cold pages are based
on page access channels and patterns. There are two access channels:
one through page tables and the other through file descriptors. The
protection of the former channel is by design stronger because:
1. The uncertainty in determining the access patterns of the former
   channel is higher due to the approximation of the accessed bit.
2. The cost of evicting the former channel is higher due to the TLB
   flushes required and the likelihood of encountering the dirty bit.
3. The penalty of underprotecting the former channel is higher because
   applications usually do not prepare themselves for major page
   faults like they do for blocked I/O. E.g., GUI applications
   commonly use dedicated I/O threads to avoid blocking rendering
   threads.

There are also two access patterns: one with temporal locality and the
other without. For the reasons listed above, the former channel is
assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is
present; the latter channel is assumed to follow the latter pattern unless
outlying refaults have been observed [3][4].

The next patch will address the "outlying refaults". Three macros, i.e.,
LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in
this patch to make the entire patchset less diffy.

A page is added to the youngest generation on faulting. The aging needs
to check the accessed bit at least twice before handing this page over to
the eviction. The first check takes care of the accessed bit set on the
initial fault; the second check makes sure this page has not been used
since then. This protocol, AKA second chance, requires a minimum of two
generations, hence MIN_NR_GENS.
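The generation bookkeeping above, as a stand-alone editorial sketch (the bit
position is invented; MAX_NR_GENS and the gen+1 encoding mirror this patch):

#include <assert.h>

#define MAX_NR_GENS	4UL
#define LRU_GEN_WIDTH	3	/* order_base_2(MAX_NR_GENS + 1) */
#define LRU_GEN_PGOFF	8	/* invented bit position within flags */
#define LRU_GEN_MASK	(((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)

/* a sequence number is truncated to an index into lrugen->lists[] */
static int lru_gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

/* the counter stores gen + 1 so that 0 means "not on any list" */
static unsigned long set_page_gen(unsigned long flags, int gen)
{
	return (flags & ~LRU_GEN_MASK) |
	       ((unsigned long)(gen + 1) << LRU_GEN_PGOFF);
}

static int page_lru_gen(unsigned long flags)
{
	return (int)((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
}

int main(void)
{
	unsigned long flags = 0;

	assert(page_lru_gen(flags) == -1);	/* not on any list */
	flags = set_page_gen(flags, lru_gen_from_seq(7));
	assert(page_lru_gen(flags) == (int)(7 % MAX_NR_GENS));
	return 0;
}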
|
||||
|
||||
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
|
||||
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
|
||||
[3] https://lwn.net/Articles/495543/
|
||||
[4] https://lwn.net/Articles/815342/
|
||||
|
||||
Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/fuse/dev.c                     |   3 +-
 include/linux/mm.h                |   2 +
 include/linux/mm_inline.h         | 177 +++++++++++++++++++++++++++++-
 include/linux/mmzone.h            | 100 +++++++++++++++++
 include/linux/page-flags-layout.h |  13 ++-
 include/linux/page-flags.h        |   4 +-
 include/linux/sched.h             |   4 +
 kernel/bounds.c                   |   5 +
 mm/Kconfig                        |   8 ++
 mm/huge_memory.c                  |   3 +-
 mm/memcontrol.c                   |   2 +
 mm/memory.c                       |  25 +++++
 mm/mm_init.c                      |   6 +-
 mm/mmzone.c                       |   2 +
 mm/swap.c                         |  10 +-
 mm/vmscan.c                       |  75 +++++++++++++
 16 files changed, 425 insertions(+), 14 deletions(-)

--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
	       1 << PG_active |
	       1 << PG_workingset |
	       1 << PG_reclaim |
-	       1 << PG_waiters))) {
+	       1 << PG_waiters |
+	       LRU_GEN_MASK | LRU_REFS_MASK))) {
 		dump_page(page, "fuse: trying to steal weird page");
 		return 1;
 	}
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
 #define ZONES_PGOFF		(NODES_PGOFF - ZONES_WIDTH)
 #define LAST_CPUPID_PGOFF	(ZONES_PGOFF - LAST_CPUPID_WIDTH)
 #define KASAN_TAG_PGOFF		(LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF		(KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_REFS_PGOFF		(LRU_GEN_PGOFF - LRU_REFS_WIDTH)
 
 /*
  * Define the bit shifts to access each section. For non-existent
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -26,10 +26,13 @@ static inline int page_is_file_lru(struc
 
 static __always_inline void __update_lru_size(struct lruvec *lruvec,
 				enum lru_list lru, enum zone_type zid,
-				int nr_pages)
+				long nr_pages)
 {
 	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
 
+	lockdep_assert_held(&lruvec->lru_lock);
+	WARN_ON_ONCE(nr_pages != (int)nr_pages);
+
 	__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
 	__mod_zone_page_state(&pgdat->node_zones[zid],
 				NR_ZONE_LRU_BASE + lru, nr_pages);
@@ -86,11 +89,177 @@ static __always_inline enum lru_list pag
 	return lru;
 }
 
+#ifdef CONFIG_LRU_GEN
+
+static inline bool lru_gen_enabled(void)
+{
+	return true;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+	return current->in_lru_fault;
+}
+
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+	return seq % MAX_NR_GENS;
+}
+
+static inline int page_lru_gen(struct page *page)
+{
+	unsigned long flags = READ_ONCE(page->flags);
+
+	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+	unsigned long max_seq = lruvec->lrugen.max_seq;
+
+	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+
+	/* see the comment on MIN_NR_GENS */
+	return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
+				       int old_gen, int new_gen)
+{
+	int type = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	int delta = thp_nr_pages(page);
+	enum lru_list lru = type * LRU_INACTIVE_FILE;
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+	VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
+	VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
+	VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
+
+	if (old_gen >= 0)
+		WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
+			   lrugen->nr_pages[old_gen][type][zone] - delta);
+	if (new_gen >= 0)
+		WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
+			   lrugen->nr_pages[new_gen][type][zone] + delta);
+
+	/* addition */
+	if (old_gen < 0) {
+		if (lru_gen_is_active(lruvec, new_gen))
+			lru += LRU_ACTIVE;
+		__update_lru_size(lruvec, lru, zone, delta);
+		return;
+	}
+
+	/* deletion */
+	if (new_gen < 0) {
+		if (lru_gen_is_active(lruvec, old_gen))
+			lru += LRU_ACTIVE;
+		__update_lru_size(lruvec, lru, zone, -delta);
+		return;
+	}
+}
+
+static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
+{
+	unsigned long seq;
+	unsigned long flags;
+	int gen = page_lru_gen(page);
+	int type = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+	VM_WARN_ON_ONCE_PAGE(gen != -1, page);
+
+	if (PageUnevictable(page))
+		return false;
+	/*
+	 * There are three common cases for this page:
+	 * 1. If it's hot, e.g., freshly faulted in or previously hot and
+	 *    migrated, add it to the youngest generation.
+	 * 2. If it's cold but can't be evicted immediately, i.e., an anon page
+	 *    not in swapcache or a dirty page pending writeback, add it to the
+	 *    second oldest generation.
+	 * 3. Everything else (clean, cold) is added to the oldest generation.
+	 */
+	if (PageActive(page))
+		seq = lrugen->max_seq;
+	else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) ||
+		 (PageReclaim(page) &&
+		  (PageDirty(page) || PageWriteback(page))))
+		seq = lrugen->min_seq[type] + 1;
+	else
+		seq = lrugen->min_seq[type];
+
+	gen = lru_gen_from_seq(seq);
+	flags = (gen + 1UL) << LRU_GEN_PGOFF;
+	/* see the comment on MIN_NR_GENS about PG_active */
+	set_mask_bits(&page->flags, LRU_GEN_MASK | BIT(PG_active), flags);
+
+	lru_gen_update_size(lruvec, page, -1, gen);
+	/* for rotate_reclaimable_page() */
+	if (reclaiming)
+		list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+	else
+		list_add(&page->lru, &lrugen->lists[gen][type][zone]);
+
+	return true;
+}
+
+static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
+{
+	unsigned long flags;
+	int gen = page_lru_gen(page);
+
+	if (gen < 0)
+		return false;
+
+	VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
+	VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
+
+	/* for migrate_page_states() */
+	flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
+	flags = set_mask_bits(&page->flags, LRU_GEN_MASK, flags);
+	gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+	lru_gen_update_size(lruvec, page, gen, -1);
+	list_del(&page->lru);
+
+	return true;
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline bool lru_gen_enabled(void)
+{
+	return false;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+	return false;
+}
+
+static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
+{
+	return false;
+}
+
+static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
+{
+	return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
 static __always_inline void add_page_to_lru_list(struct page *page,
 				struct lruvec *lruvec)
 {
 	enum lru_list lru = page_lru(page);
 
+	if (lru_gen_add_page(lruvec, page, false))
+		return;
+
 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
 	list_add(&page->lru, &lruvec->lists[lru]);
 }
@@ -100,6 +269,9 @@ static __always_inline void add_page_to_
 {
 	enum lru_list lru = page_lru(page);
 
+	if (lru_gen_add_page(lruvec, page, true))
+		return;
+
 	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
 	list_add_tail(&page->lru, &lruvec->lists[lru]);
 }
@@ -107,6 +279,9 @@ static __always_inline void add_page_to_
 static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec)
 {
+	if (lru_gen_del_page(lruvec, page, false))
+		return;
+
 	list_del(&page->lru);
 	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
 			-thp_nr_pages(page));
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -294,6 +294,102 @@ enum lruvec_flags {
 	 */
 };
 
+#endif /* !__GENERATING_BOUNDS_H */
+
+/*
+ * Evictable pages are divided into multiple generations. The youngest and the
+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
+ * corresponding generation. The gen counter in page->flags stores gen+1 while
+ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
+ *
+ * A page is added to the youngest generation on faulting. The aging needs to
+ * check the accessed bit at least twice before handing this page over to the
+ * eviction. The first check takes care of the accessed bit set on the initial
+ * fault; the second check makes sure this page hasn't been used since then.
+ * This process, AKA second chance, requires a minimum of two generations,
+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
+ * rest of generations, if they exist, are considered inactive. See
+ * lru_gen_is_active().
+ *
+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
+ * the aging needs not to worry about it. And it's set again when a page
+ * considered active is isolated for non-reclaiming purposes, e.g., migration.
+ * See lru_gen_add_page() and lru_gen_del_page().
+ *
+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
+ * number of categories of the active/inactive LRU when keeping track of
+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
+ * in page->flags.
+ */
+#define MIN_NR_GENS		2U
+#define MAX_NR_GENS		4U
+
+#ifndef __GENERATING_BOUNDS_H
+
+struct lruvec;
+
+#define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+enum {
+	LRU_GEN_ANON,
+	LRU_GEN_FILE,
+};
+
+/*
+ * The youngest generation number is stored in max_seq for both anon and file
+ * types as they are aged on an equal footing. The oldest generation numbers are
+ * stored in min_seq[] separately for anon and file types as clean file pages
+ * can be evicted regardless of swap constraints.
+ *
+ * Normally anon and file min_seq are in sync. But if swapping is constrained,
+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
+ * min_seq behind.
+ *
+ * The number of pages in each generation is eventually consistent and therefore
+ * can be transiently negative.
+ */
+struct lru_gen_struct {
+	/* the aging increments the youngest generation number */
+	unsigned long max_seq;
+	/* the eviction increments the oldest generation numbers */
+	unsigned long min_seq[ANON_AND_FILE];
+	/* the multi-gen LRU lists, lazily sorted on eviction */
+	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+	/* the multi-gen LRU sizes, eventually consistent */
+	long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+};
+
+void lru_gen_init_lruvec(struct lruvec *lruvec);
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
+void lru_gen_exit_memcg(struct mem_cgroup *memcg);
+#endif
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+}
+#endif
+
+#endif /* CONFIG_LRU_GEN */
+
 struct lruvec {
 	struct list_head		lists[NR_LRU_LISTS];
 	/* per lruvec lru_lock for memcg */
@@ -311,6 +407,10 @@ struct lruvec {
 	unsigned long			refaults[ANON_AND_FILE];
 	/* Various lruvec state flags (enum lruvec_flags) */
 	unsigned long			flags;
+#ifdef CONFIG_LRU_GEN
+	/* evictable pages divided into generations */
+	struct lru_gen_struct		lrugen;
+#endif
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
 #endif
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -55,7 +55,8 @@
 #define SECTIONS_WIDTH		0
 #endif
 
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+	<= BITS_PER_LONG - NR_PAGEFLAGS
 #define NODES_WIDTH		NODES_SHIFT
 #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
 #error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +90,8 @@
 #define LAST_CPUPID_SHIFT 0
 #endif
 
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
-	<= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+	KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
 #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
 #else
 #define LAST_CPUPID_WIDTH 0
@@ -100,10 +101,12 @@
 #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
 #endif
 
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
-	> BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+	KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
 #error "Not enough bits in page flags"
 #endif
 
+#define LRU_REFS_WIDTH	0
+
 #endif
 #endif /* _LINUX_PAGE_FLAGS_LAYOUT */
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
	 1UL << PG_private	| 1UL << PG_private_2	|	\
	 1UL << PG_writeback	| 1UL << PG_reserved	|	\
	 1UL << PG_slab		| 1UL << PG_active	|	\
-	 1UL << PG_unevictable	| __PG_MLOCKED)
+	 1UL << PG_unevictable	| __PG_MLOCKED | LRU_GEN_MASK)
 
 /*
  * Flags checked when a page is prepped for return by the page allocator.
@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
  * alloc-free cycle to prevent from reusing the page.
  */
 #define PAGE_FLAGS_CHECK_AT_PREP	\
-	(PAGEFLAGS_MASK & ~__PG_HWPOISON)
+	((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
 
 #define PAGE_FLAGS_PRIVATE				\
	(1UL << PG_private | 1UL << PG_private_2)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -911,6 +911,10 @@ struct task_struct {
 #ifdef CONFIG_MEMCG
	unsigned			in_user_fault:1;
 #endif
+#ifdef CONFIG_LRU_GEN
+	/* whether the LRU algorithm may apply to this access */
+	unsigned			in_lru_fault:1;
+#endif
 #ifdef CONFIG_COMPAT_BRK
	unsigned			brk_randomized:1;
 #endif
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,11 @@ int main(void)
	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
 #endif
	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+	DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
+#else
+	DEFINE(LRU_GEN_WIDTH, 0);
+#endif
	/* End of constants */
 
	return 0;
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -897,6 +897,14 @@ config IO_MAPPING
 config SECRETMEM
	def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
 
+config LRU_GEN
+	bool "Multi-Gen LRU"
+	depends on MMU
+	# make sure page->flags has enough spare bits
+	depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
+	help
+	  A high performance LRU implementation to overcommit memory.
+
 source "mm/damon/Kconfig"
 
 endmenu
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2366,7 +2366,8 @@ static void __split_huge_page_tail(struc
 #ifdef CONFIG_64BIT
			 (1L << PG_arch_2) |
 #endif
-			 (1L << PG_dirty)));
+			 (1L << PG_dirty) |
+			 LRU_GEN_MASK | LRU_REFS_MASK));
 
	/* ->mapping in first tail page is compound_mapcount */
	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5178,6 +5178,7 @@ static void __mem_cgroup_free(struct mem
 
 static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
+	lru_gen_exit_memcg(memcg);
	memcg_wb_domain_exit(memcg);
	__mem_cgroup_free(memcg);
 }
@@ -5241,6 +5242,7 @@ static struct mem_cgroup *mem_cgroup_all
	memcg->deferred_split_queue.split_queue_len = 0;
 #endif
	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+	lru_gen_init_memcg(memcg);
	return memcg;
 fail:
	mem_cgroup_id_remove(memcg);
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4792,6 +4792,27 @@ static inline void mm_account_fault(stru
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
 }
 
+#ifdef CONFIG_LRU_GEN
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+	/* the LRU algorithm doesn't apply to sequential or random reads */
+	current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+}
+
+static void lru_gen_exit_fault(void)
+{
+	current->in_lru_fault = false;
+}
+#else
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
+{
+}
+
+static void lru_gen_exit_fault(void)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
 /*
  * By the time we get here, we already hold the mm semaphore
  *
@@ -4823,11 +4844,15 @@ vm_fault_t handle_mm_fault(struct vm_are
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_enter_user_fault();
 
+	lru_gen_enter_fault(vma);
+
	if (unlikely(is_vm_hugetlb_page(vma)))
		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
	else
		ret = __handle_mm_fault(vma, address, flags);
 
+	lru_gen_exit_fault();
+
	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_exit_user_fault();
		/*
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
 
	shift = 8 * sizeof(unsigned long);
	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
-		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
		SECTIONS_WIDTH,
		NODES_WIDTH,
		ZONES_WIDTH,
		LAST_CPUPID_WIDTH,
		KASAN_TAG_WIDTH,
+		LRU_GEN_WIDTH,
+		LRU_REFS_WIDTH,
		NR_PAGEFLAGS);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
 
	for_each_lru(lru)
		INIT_LIST_HEAD(&lruvec->lists[lru]);
+
+	lru_gen_init_lruvec(lruvec);
 }
 
 #if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);
 
+	/* see the comment in lru_gen_add_page() */
+	if (lru_gen_enabled() && !PageUnevictable(page) &&
+	    lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
+		SetPageActive(page);
+
	get_page(page);
	local_lock(&lru_pvecs.lock);
	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
 
 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
 {
-	if (PageActive(page) && !PageUnevictable(page)) {
+	if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
		int nr_pages = thp_nr_pages(page);
 
		del_page_from_lru_list(page, lruvec);
@@ -661,7 +666,8 @@ void deactivate_file_page(struct page *p
  */
 void deactivate_page(struct page *page)
 {
-	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+	if (PageLRU(page) && !PageUnevictable(page) &&
+	    (PageActive(page) || lru_gen_enabled())) {
		struct pagevec *pvec;
 
		local_lock(&lru_pvecs.lock);
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2821,6 +2821,81 @@ static bool can_age_anon_pages(struct pg
	return can_demote(pgdat->node_id, sc);
 }
 
+#ifdef CONFIG_LRU_GEN
+
+/******************************************************************************
+ *                          shorthand helpers
+ ******************************************************************************/
+
+#define for_each_gen_type_zone(gen, type, zone)				\
+	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)			\
+		for ((type) = 0; (type) < ANON_AND_FILE; (type)++)	\
+			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+	if (memcg) {
+		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+		/* for hotadd_new_pgdat() */
+		if (!lruvec->pgdat)
+			lruvec->pgdat = pgdat;
+
+		return lruvec;
+	}
+#endif
+	VM_WARN_ON_ONCE(!mem_cgroup_disabled());
+
+	return pgdat ? &pgdat->__lruvec : NULL;
+}
+
+/******************************************************************************
+ *                          initialization
+ ******************************************************************************/
+
+void lru_gen_init_lruvec(struct lruvec *lruvec)
+{
+	int gen, type, zone;
+	struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+	lrugen->max_seq = MIN_NR_GENS + 1;
+
+	for_each_gen_type_zone(gen, type, zone)
+		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+		VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
+					   sizeof(lruvec->lrugen.nr_pages)));
+	}
+}
+#endif
+
+static int __init init_lru_gen(void)
+{
+	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+	return 0;
+};
+late_initcall(init_lru_gen);
+
+#endif /* CONFIG_LRU_GEN */
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
	unsigned long nr[NR_LRU_LISTS];
[file diff suppressed because it is too large]
@ -0,0 +1,491 @@
From e4277535f6d6708bb19b88c4bad155832671d69b Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:04 -0600
Subject: [PATCH 07/29] mm: multi-gen LRU: exploit locality in rmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Searching the rmap for PTEs mapping each page on an LRU list (to test and
clear the accessed bit) can be expensive because pages from different VMAs
(PA space) are not cache friendly to the rmap (VA space). For workloads
mostly using mapped pages, searching the rmap can incur the highest CPU
cost in the reclaim path.

This patch exploits spatial locality to reduce the trips into the rmap.
When shrink_page_list() walks the rmap and finds a young PTE, a new
function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent
PTEs. On finding another young PTE, it clears the accessed bit and
updates the gen counter of the page mapped by this PTE to
(max_seq%MAX_NR_GENS)+1.

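As a rough illustration (not part of the patch), the window that
lru_gen_look_around() scans can be modeled as below. PAGE_SIZE, PMD_MASK and
MIN_LRU_BATCH are stand-ins here (4 KiB pages, 2 MiB PMDs and
BITS_PER_LONG == 64 assumed), and look_around_window() is an invented name;
the clamping arithmetic itself mirrors the function added below:

/* clamp a MIN_LRU_BATCH-page window around a young PTE at addr */
#define PAGE_SIZE	4096UL			/* assumption: 4 KiB pages */
#define MIN_LRU_BATCH	64UL			/* BITS_PER_LONG on 64-bit */
#define PMD_MASK	(~((1UL << 21) - 1))	/* assumption: 2 MiB PMD span */

static void look_around_window(unsigned long addr,
			       unsigned long vm_start, unsigned long vm_end,
			       unsigned long *startp, unsigned long *endp)
{
	unsigned long start = addr & PMD_MASK;	/* stay inside one PMD table */
	unsigned long end = (addr | ~PMD_MASK) + 1;

	if (start < vm_start)			/* ...and inside the VMA */
		start = vm_start;
	if (end > vm_end)
		end = vm_end;

	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
		/* cap the window, keeping addr centered when possible */
		if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
			end = start + MIN_LRU_BATCH * PAGE_SIZE;
		else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
			start = end - MIN_LRU_BATCH * PAGE_SIZE;
		else {
			start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
			end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
		}
	}

	*startp = start;
	*endp = end;
}
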
Server benchmark results:
  Single workload:
    fio (buffered I/O): no change

  Single workload:
    memcached (anon): +[3, 5]%
                Ops/sec      KB/sec
      patch1-6: 1106168.46   43025.04
      patch1-7: 1147696.57   44640.29

  Configurations:
    no change

Client benchmark results:
  kswapd profiles:
    patch1-6
      39.03%  lzo1x_1_do_compress (real work)
      18.47%  page_vma_mapped_walk (overhead)
       6.74%  _raw_spin_unlock_irq
       3.97%  do_raw_spin_lock
       2.49%  ptep_clear_flush
       2.48%  anon_vma_interval_tree_iter_first
       1.92%  page_referenced_one
       1.88%  __zram_bvec_write
       1.48%  memmove
       1.31%  vma_interval_tree_iter_next

    patch1-7
      48.16%  lzo1x_1_do_compress (real work)
       8.20%  page_vma_mapped_walk (overhead)
       7.06%  _raw_spin_unlock_irq
       2.92%  ptep_clear_flush
       2.53%  __zram_bvec_write
       2.11%  do_raw_spin_lock
       2.02%  memmove
       1.93%  lru_gen_look_around
       1.56%  free_unref_page_list
       1.40%  memset

  Configurations:
    no change

Link: https://lkml.kernel.org/r/20220918080010.2920238-8-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/memcontrol.h |  31 +++++++
 include/linux/mmzone.h     |   6 ++
 mm/internal.h              |   1 +
 mm/memcontrol.c            |   1 +
 mm/rmap.c                  |   7 ++
 mm/swap.c                  |   4 +-
 mm/vmscan.c                | 184 +++++++++++++++++++++++++++++++++++++
 7 files changed, 232 insertions(+), 2 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -442,6 +442,7 @@ static inline struct obj_cgroup *__page_
  * - LRU isolation
  * - lock_page_memcg()
  * - exclusive reference
+ * - mem_cgroup_trylock_pages()
  *
  * For a kmem page a caller should hold an rcu read lock to protect memcg
  * associated with a kmem page from being released.
@@ -497,6 +498,7 @@ static inline struct mem_cgroup *page_me
  * - LRU isolation
  * - lock_page_memcg()
  * - exclusive reference
+ * - mem_cgroup_trylock_pages()
  *
  * For a kmem page a caller should hold an rcu read lock to protect memcg
  * associated with a kmem page from being released.
@@ -953,6 +955,23 @@ void unlock_page_memcg(struct page *page
 
 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
 
+/* try to stablize page_memcg() for all the pages in a memcg */
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+	rcu_read_lock();
+
+	if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
+		return true;
+
+	rcu_read_unlock();
+	return false;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+	rcu_read_unlock();
+}
+
 /* idx can be of type enum memcg_stat_item or node_stat_item */
 static inline void mod_memcg_state(struct mem_cgroup *memcg,
				   int idx, int val)
@@ -1369,6 +1388,18 @@ static inline void unlock_page_memcg(str
 {
 }
 
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+	/* to match page_memcg_rcu() */
+	rcu_read_lock();
+	return true;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+	rcu_read_unlock();
+}
+
 static inline void mem_cgroup_handle_over_high(void)
 {
 }
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -352,6 +352,7 @@ enum lruvec_flags {
 #ifndef __GENERATING_BOUNDS_H
 
 struct lruvec;
+struct page_vma_mapped_walk;
 
 #define LRU_GEN_MASK		((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
 #define LRU_REFS_MASK		((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
@@ -407,6 +408,7 @@ struct lru_gen_struct {
 };
 
 void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
 
 #ifdef CONFIG_MEMCG
 void lru_gen_init_memcg(struct mem_cgroup *memcg);
@@ -419,6 +421,10 @@ static inline void lru_gen_init_lruvec(s
 {
 }
 
+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
 #ifdef CONFIG_MEMCG
 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
 {
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -35,6 +35,7 @@
 void page_writeback_init(void);
 
 vm_fault_t do_swap_page(struct vm_fault *vmf);
+void activate_page(struct page *page);
 
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
		unsigned long floor, unsigned long ceiling);
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2798,6 +2798,7 @@ static void commit_charge(struct page *p
	 * - LRU isolation
	 * - lock_page_memcg()
	 * - exclusive reference
+	 * - mem_cgroup_trylock_pages()
	 */
	page->memcg_data = (unsigned long)memcg;
 }
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -73,6 +73,7 @@
 #include <linux/page_idle.h>
 #include <linux/memremap.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h>
 
@@ -793,6 +794,12 @@ static bool page_referenced_one(struct p
		}
 
		if (pvmw.pte) {
+			if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+			    !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+				lru_gen_look_around(&pvmw);
+				referenced++;
+			}
+
			if (ptep_clear_flush_young_notify(vma, address,
						pvmw.pte)) {
				/*
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -325,7 +325,7 @@ static bool need_activate_page_drain(int
	return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
 }
 
-static void activate_page(struct page *page)
+void activate_page(struct page *page)
 {
	page = compound_head(page);
	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -345,7 +345,7 @@ static inline void activate_page_drain(i
 {
 }
 
-static void activate_page(struct page *page)
+void activate_page(struct page *page)
 {
	struct lruvec *lruvec;
 
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1409,6 +1409,11 @@ retry:
		if (!sc->may_unmap && page_mapped(page))
			goto keep_locked;
 
+		/* page_update_gen() tried to promote this page? */
+		if (lru_gen_enabled() && !ignore_references &&
+		    page_mapped(page) && PageReferenced(page))
+			goto keep_locked;
+
		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
@@ -2990,6 +2995,29 @@ static bool positive_ctrl_err(struct ctr
  *                          the aging
  ******************************************************************************/
 
+/* promote pages accessed through page tables */
+static int page_update_gen(struct page *page, int gen)
+{
+	unsigned long new_flags, old_flags = READ_ONCE(page->flags);
+
+	VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+	VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+	do {
+		/* lru_gen_del_page() has isolated this page? */
+		if (!(old_flags & LRU_GEN_MASK)) {
+			/* for shrink_page_list() */
+			new_flags = old_flags | BIT(PG_referenced);
+			continue;
+		}
+
+		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+		new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+	} while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
+
+	return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
 /* protect pages accessed multiple times through file descriptors */
 static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
 {
@@ -3001,6 +3029,11 @@ static int page_inc_gen(struct lruvec *l
	VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);
 
	do {
+		new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+		/* page_update_gen() has promoted this page? */
+		if (new_gen >= 0 && new_gen != old_gen)
+			return new_gen;
+
		new_gen = (old_gen + 1) % MAX_NR_GENS;
 
		new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
@@ -3015,6 +3048,43 @@ static int page_inc_gen(struct lruvec *l
	return new_gen;
 }
 
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+{
+	unsigned long pfn = pte_pfn(pte);
+
+	VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+	if (!pte_present(pte) || is_zero_pfn(pfn))
+		return -1;
+
+	if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+		return -1;
+
+	if (WARN_ON_ONCE(!pfn_valid(pfn)))
+		return -1;
+
+	return pfn;
+}
+
+static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
+				 struct pglist_data *pgdat)
+{
+	struct page *page;
+
+	/* try to avoid unnecessary memory loads */
+	if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+		return NULL;
+
+	page = compound_head(pfn_to_page(pfn));
+	if (page_to_nid(page) != pgdat->node_id)
+		return NULL;
+
+	if (page_memcg_rcu(page) != memcg)
+		return NULL;
+
+	return page;
+}
+
 static void inc_min_seq(struct lruvec *lruvec, int type)
 {
	struct lru_gen_struct *lrugen = &lruvec->lrugen;
@@ -3214,6 +3284,114 @@ static void lru_gen_age_node(struct pgli
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
 }
 
+/*
+ * This function exploits spatial locality when shrink_page_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+	int i;
+	pte_t *pte;
+	unsigned long start;
+	unsigned long end;
+	unsigned long addr;
+	unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+	struct page *page = pvmw->page;
+	struct mem_cgroup *memcg = page_memcg(page);
+	struct pglist_data *pgdat = page_pgdat(page);
+	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+	DEFINE_MAX_SEQ(lruvec);
+	int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+	lockdep_assert_held(pvmw->ptl);
+	VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
+
+	if (spin_is_contended(pvmw->ptl))
+		return;
+
+	start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+	end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+
+	if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+		if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+			end = start + MIN_LRU_BATCH * PAGE_SIZE;
+		else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+			start = end - MIN_LRU_BATCH * PAGE_SIZE;
+		else {
+			start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
+			end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+		}
+	}
+
+	pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+
+	rcu_read_lock();
+	arch_enter_lazy_mmu_mode();
+
+	for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+		unsigned long pfn;
+
+		pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+		if (pfn == -1)
+			continue;
+
+		if (!pte_young(pte[i]))
+			continue;
+
+		page = get_pfn_page(pfn, memcg, pgdat);
+		if (!page)
+			continue;
+
+		if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+			VM_WARN_ON_ONCE(true);
+
+		if (pte_dirty(pte[i]) && !PageDirty(page) &&
+		    !(PageAnon(page) && PageSwapBacked(page) &&
+		      !PageSwapCache(page)))
+			set_page_dirty(page);
+
+		old_gen = page_lru_gen(page);
+		if (old_gen < 0)
+			SetPageReferenced(page);
+		else if (old_gen != new_gen)
+			__set_bit(i, bitmap);
+	}
+
+	arch_leave_lazy_mmu_mode();
+	rcu_read_unlock();
+
+	if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+		for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+			page = pte_page(pte[i]);
+			activate_page(page);
+		}
+		return;
+	}
+
+	/* page_update_gen() requires stable page_memcg() */
+	if (!mem_cgroup_trylock_pages(memcg))
+		return;
+
+	spin_lock_irq(&lruvec->lru_lock);
+	new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+
+	for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+		page = compound_head(pte_page(pte[i]));
+		if (page_memcg_rcu(page) != memcg)
+			continue;
+
+		old_gen = page_update_gen(page, new_gen);
+		if (old_gen < 0 || old_gen == new_gen)
+			continue;
+
+		lru_gen_update_size(lruvec, page, old_gen, new_gen);
+	}
+
+	spin_unlock_irq(&lruvec->lru_lock);
+
+	mem_cgroup_unlock_pages();
+}
+
 /******************************************************************************
  *                          the eviction
  ******************************************************************************/
@@ -3250,6 +3428,12 @@ static bool sort_page(struct lruvec *lru
		return true;
	}
 
+	/* promoted */
+	if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+		list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+		return true;
+	}
+
	/* protected */
	if (tier > tier_idx) {
		int hist = lru_hist_from_seq(lrugen->min_seq[type]);
[file diff suppressed because it is too large]
@ -0,0 +1,315 @@
From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:06 -0600
Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When multiple memcgs are available, it is possible to use generations as a
frame of reference to make better choices and improve overall performance
under global memory pressure. This patch adds a basic optimization to
select memcgs that can drop single-use unmapped clean pages first. Doing
so reduces the chance of going into the aging path or swapping, which can
be costly.

A typical example that benefits from this optimization is a server running
mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
buffered I/O workload in the other.

Though this optimization can be applied to both kswapd and direct reclaim,
it is only added to kswapd to keep the patchset manageable. Later
improvements may cover the direct reclaim path.

While ensuring certain fairness to all eligible memcgs, proportional scans
of individual memcgs also require proper backoff to avoid overshooting
their aggregate reclaim target by too much. Otherwise it can cause high
direct reclaim latency. The conditions for backoff are (a sketch of these
two rules follows the list):

1. At low priorities, for direct reclaim, if aging fairness or direct
   reclaim latency is at risk, i.e., aging one memcg multiple times or
   swapping after the target is met.
2. At high priorities, for global reclaim, if per-zone free pages are
   above respective watermarks.

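As a loose paraphrase (not part of the patch) of the two conditions above,
struct sc_view and should_back_off() are invented stand-ins for struct
scan_control and the should_abort_scan() added below, which also handles
memcg reclaim and fatal signals:

#include <stdbool.h>

#define DEF_PRIORITY 12

/* assumption: a reduced view of struct scan_control, for this sketch only */
struct sc_view {
	bool kswapd;			/* running as kswapd (global reclaim)? */
	int priority;			/* DEF_PRIORITY down to 0 as pressure grows */
	unsigned long nr_reclaimed;
	unsigned long nr_to_reclaim;
	bool swapped;			/* anon pages were evicted this round */
	bool zones_ok;			/* free pages above the relevant watermarks */
};

static bool should_back_off(const struct sc_view *sc)
{
	/* 1. direct reclaim: back off once the target is met and swapping began */
	if (!sc->kswapd && sc->nr_reclaimed >= sc->nr_to_reclaim && sc->swapped)
		return true;

	/* 2. raised priority: back off once per-zone watermarks are satisfied */
	return sc->priority <= DEF_PRIORITY - 2 && sc->zones_ok;
}
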
Server benchmark results:
  Mixed workloads:
    fio (buffered I/O): +[19, 21]%
                IOPS         BW
      patch1-8: 1880k        7343MiB/s
      patch1-9: 2252k        8796MiB/s

    memcached (anon): +[119, 123]%
                Ops/sec      KB/sec
      patch1-8: 862768.65    33514.68
      patch1-9: 1911022.12   74234.54

  Mixed workloads:
    fio (buffered I/O): +[75, 77]%
                IOPS         BW
      5.19-rc1: 1279k        4996MiB/s
      patch1-9: 2252k        8796MiB/s

    memcached (anon): +[13, 15]%
                Ops/sec      KB/sec
      5.19-rc1: 1673524.04   65008.87
      patch1-9: 1911022.12   74234.54

  Configurations:
    (changes since patch 6)

    cat mixed.sh
    modprobe brd rd_nr=2 rd_size=56623104

    swapoff -a
    mkswap /dev/ram0
    swapon /dev/ram0

    mkfs.ext4 /dev/ram1
    mount -t ext4 /dev/ram1 /mnt

    memtier_benchmark -S /var/run/memcached/memcached.sock \
      -P memcache_binary -n allkeys --key-minimum=1 \
      --key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
      --ratio 1:0 --pipeline 8 -d 2000

    fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
      --buffered=1 --ioengine=io_uring --iodepth=128 \
      --iodepth_batch_submit=32 --iodepth_batch_complete=32 \
      --rw=randread --random_distribution=random --norandommap \
      --time_based --ramp_time=10m --runtime=90m --group_reporting &
    pid=$!

    sleep 200

    memtier_benchmark -S /var/run/memcached/memcached.sock \
      -P memcache_binary -n allkeys --key-minimum=1 \
      --key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
      --ratio 0:1 --pipeline 8 --randomize --distinct-client-seed

    kill -INT $pid
    wait

Client benchmark results:
  no change (CONFIG_MEMCG=n)

Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 96 insertions(+), 9 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -127,6 +127,12 @@ struct scan_control {
	/* Always discard instead of demoting to lower tier memory */
	unsigned int no_demotion:1;
 
+#ifdef CONFIG_LRU_GEN
+	/* help kswapd make better choices among multiple memcgs */
+	unsigned int memcgs_need_aging:1;
+	unsigned long last_reclaimed;
+#endif
+
	/* Allocation order */
	s8 order;
 
@@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pgli
 
	VM_WARN_ON_ONCE(!current_is_kswapd());
 
+	sc->last_reclaimed = sc->nr_reclaimed;
+
+	/*
+	 * To reduce the chance of going into the aging path, which can be
+	 * costly, optimistically skip it if the flag below was cleared in the
+	 * eviction path. This improves the overall performance when multiple
+	 * memcgs are available.
+	 */
+	if (!sc->memcgs_need_aging) {
+		sc->memcgs_need_aging = true;
+		return;
+	}
+
	set_mm_walk(pgdat);
 
	memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *
	return scanned;
 }
 
-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+		       bool *need_swapping)
 {
	int type;
	int scanned;
@@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lr
 
	sc->nr_reclaimed += reclaimed;
 
+	if (need_swapping && type == LRU_GEN_ANON)
+		*need_swapping = true;
+
	return scanned;
 }
 
@@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lr
  * reclaim.
  */
 static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
-				    bool can_swap)
+				    bool can_swap, bool *need_aging)
 {
-	bool need_aging;
	unsigned long nr_to_scan;
	struct mem_cgroup *memcg = lruvec_memcg(lruvec);
	DEFINE_MAX_SEQ(lruvec);
@@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(stru
	    (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
		return 0;
 
-	need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
-	if (!need_aging)
+	*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+	if (!*need_aging)
		return nr_to_scan;
 
	/* skip the aging path at the default priority */
@@ -4715,10 +4737,68 @@ done:
	return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
 }
 
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+			      struct scan_control *sc, bool need_swapping)
+{
+	int i;
+	DEFINE_MAX_SEQ(lruvec);
+
+	if (!current_is_kswapd()) {
+		/* age each memcg once to ensure fairness */
+		if (max_seq - seq > 1)
+			return true;
+
+		/* over-swapping can increase allocation latency */
+		if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+			return true;
+
+		/* give this thread a chance to exit and free its memory */
+		if (fatal_signal_pending(current)) {
+			sc->nr_reclaimed += MIN_LRU_BATCH;
+			return true;
+		}
+
+		if (cgroup_reclaim(sc))
+			return false;
+	} else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+		return false;
+
+	/* keep scanning at low priorities to ensure fairness */
+	if (sc->priority > DEF_PRIORITY - 2)
+		return false;
+
+	/*
+	 * A minimum amount of work was done under global memory pressure. For
+	 * kswapd, it may be overshooting. For direct reclaim, the target isn't
+	 * met, and yet the allocation may still succeed, since kswapd may have
+	 * caught up. In either case, it's better to stop now, and restart if
+	 * necessary.
+	 */
+	for (i = 0; i <= sc->reclaim_idx; i++) {
+		unsigned long wmark;
+		struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+		if (!managed_zone(zone))
+			continue;
+
+		wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+		if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+			return false;
+	}
+
+	sc->nr_reclaimed += MIN_LRU_BATCH;
+
+	return true;
+}
+
 static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
	struct blk_plug plug;
+	bool need_aging = false;
+	bool need_swapping = false;
	unsigned long scanned = 0;
+	unsigned long reclaimed = sc->nr_reclaimed;
+	DEFINE_MAX_SEQ(lruvec);
 
	lru_add_drain();
 
@@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct
		else
			swappiness = 0;
 
-		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+		nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
		if (!nr_to_scan)
-			break;
+			goto done;
 
-		delta = evict_pages(lruvec, sc, swappiness);
+		delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
		if (!delta)
-			break;
+			goto done;
 
		scanned += delta;
		if (scanned >= nr_to_scan)
			break;
 
+		if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+			break;
+
		cond_resched();
	}
 
+	/* see the comment in lru_gen_age_node() */
+	if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
+		sc->memcgs_need_aging = false;
+done:
	clear_mm_walk();
 
	blk_finish_plug(&plug);
@ -0,0 +1,498 @@
From 640db3a029dca909af47157ca18f52b29d34a1b9 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:07 -0600
Subject: [PATCH 10/29] mm: multi-gen LRU: kill switch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
can be disabled include:
  0x0001: the multi-gen LRU core
  0x0002: walking page table, when arch_has_hw_pte_young() returns
          true
  0x0004: clearing the accessed bit in non-leaf PMD entries, when
          CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
  [yYnN]: apply to all the components above
E.g.,
  echo y >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0007
  echo 5 >/sys/kernel/mm/lru_gen/enabled
  cat /sys/kernel/mm/lru_gen/enabled
  0x0005

NB: the page table walks happen on the scale of seconds under heavy memory
pressure, in which case the mmap_lock contention is a lesser concern,
compared with the LRU lock contention and the I/O congestion. So far the
only well-known case of the mmap_lock contention happens on Android, due
to Scudo [1] which allocates several thousand VMAs for merely a few
hundred MBs. The SPF and the Maple Tree also have provided their own
assessments [2][3]. However, if walking page tables does worsen the
mmap_lock contention, the kill switch can be used to disable it. In this
case the multi-gen LRU will suffer a minor performance degradation, as
shown previously.

Clearing the accessed bit in non-leaf PMD entries can also be disabled,
since this behavior was not tested on x86 varieties other than Intel and
AMD.

[1] https://source.android.com/devices/tech/debug/scudo
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/

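As a rough illustration (not part of the patch), the sysfs value is a
capability bitmask. The enum below mirrors the caps this series defines
(LRU_GEN_CORE and friends, indexing the lru_gen_caps static keys used in
the diff), while cap_enabled() is an invented helper:

#include <stdbool.h>

/* the capabilities gated by /sys/kernel/mm/lru_gen/enabled */
enum {
	LRU_GEN_CORE,		/* 0x0001: the multi-gen LRU core */
	LRU_GEN_MM_WALK,	/* 0x0002: page table walks */
	LRU_GEN_NONLEAF_YOUNG,	/* 0x0004: accessed bit in non-leaf PMD entries */
	NR_LRU_GEN_CAPS
};

/* invented helper: test one capability in the value read back from sysfs */
static inline bool cap_enabled(unsigned int caps, int cap)
{
	return caps & (1U << cap);
}

With this encoding, the "echo 5" example above enables LRU_GEN_CORE (0x0001)
and the non-leaf PMD capability (0x0004) while leaving page table walks
(0x0002) disabled, which matches the 0x0005 readback.
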
Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/cgroup.h          |  15 ++-
 include/linux/mm_inline.h       |  15 ++-
 include/linux/mmzone.h          |   9 ++
 kernel/cgroup/cgroup-internal.h |   1 -
 mm/Kconfig                      |   6 +
 mm/vmscan.c                     | 228 +++++++++++++++++++++++++++++++-
 6 files changed, 265 insertions(+), 9 deletions(-)

--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgr
	css_put(&cgrp->self);
 }
 
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+	mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+	mutex_unlock(&cgroup_mutex);
+}
+
 /**
  * task_css_set_check - obtain a task's css_set with extra access conditions
  * @task: the task to obtain css_set for
@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgr
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
 extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)					\
	rcu_dereference_check((task)->cgroups,				\
@@ -708,6 +719,8 @@ struct cgroup;
 static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
 static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
					 struct task_struct *t) { return 0; }
 static inline int cgroupstats_build(struct cgroupstats *stats,
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -91,10 +91,21 @@ static __always_inline enum lru_list pag
 
 #ifdef CONFIG_LRU_GEN
 
+#ifdef CONFIG_LRU_GEN_ENABLED
 static inline bool lru_gen_enabled(void)
 {
-	return true;
+	DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+	return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#else
+static inline bool lru_gen_enabled(void)
+{
+	DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+	return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
 }
+#endif
 
 static inline bool lru_gen_in_fault(void)
 {
@@ -207,7 +218,7 @@ static inline bool lru_gen_add_page(stru
 
	VM_WARN_ON_ONCE_PAGE(gen != -1, page);
 
-	if (PageUnevictable(page))
+	if (PageUnevictable(page) || !lrugen->enabled)
		return false;
	/*
	 * There are three common cases for this page:
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -364,6 +364,13 @@ enum {
|
||||
LRU_GEN_FILE,
|
||||
};
|
||||
|
||||
+enum {
|
||||
+ LRU_GEN_CORE,
|
||||
+ LRU_GEN_MM_WALK,
|
||||
+ LRU_GEN_NONLEAF_YOUNG,
|
||||
+ NR_LRU_GEN_CAPS
|
||||
+};
|
||||
+
|
||||
#define MIN_LRU_BATCH BITS_PER_LONG
|
||||
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
|
||||
|
||||
@@ -405,6 +412,8 @@ struct lru_gen_struct {
|
||||
/* can be modified without holding the LRU lock */
|
||||
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
||||
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
||||
+ /* whether the multi-gen LRU is enabled */
|
||||
+ bool enabled;
|
||||
};
|
||||
|
||||
enum {
|
||||
--- a/kernel/cgroup/cgroup-internal.h
|
||||
+++ b/kernel/cgroup/cgroup-internal.h
|
||||
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
|
||||
#define DEFINE_CGROUP_MGCTX(name) \
|
||||
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
|
||||
|
||||
-extern struct mutex cgroup_mutex;
|
||||
extern spinlock_t css_set_lock;
|
||||
extern struct cgroup_subsys *cgroup_subsys[];
|
||||
extern struct list_head cgroup_roots;
|
||||
--- a/mm/Kconfig
|
||||
+++ b/mm/Kconfig
|
||||
@@ -906,6 +906,12 @@ config LRU_GEN
|
||||
help
|
||||
A high performance LRU implementation to overcommit memory.
|
||||
|
||||
+config LRU_GEN_ENABLED
|
||||
+ bool "Enable by default"
|
||||
+ depends on LRU_GEN
|
||||
+ help
|
||||
+ This option enables the multi-gen LRU by default.
|
||||
+
|
||||
config LRU_GEN_STATS
|
||||
bool "Full stats for debugging"
|
||||
depends on LRU_GEN
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -52,6 +52,7 @@
|
||||
#include <linux/psi.h>
|
||||
#include <linux/pagewalk.h>
|
||||
#include <linux/shmem_fs.h>
|
||||
+#include <linux/ctype.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/div64.h>
|
||||
@@ -2841,6 +2842,14 @@ static bool can_age_anon_pages(struct pg
|
||||
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN_ENABLED
|
||||
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
||||
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
|
||||
+#else
|
||||
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
||||
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
|
||||
+#endif
|
||||
+
|
||||
/******************************************************************************
|
||||
* shorthand helpers
|
||||
******************************************************************************/
|
||||
@@ -3717,7 +3726,8 @@ static void walk_pmd_range_locked(pud_t
|
||||
goto next;
|
||||
|
||||
if (!pmd_trans_huge(pmd[i])) {
|
||||
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
|
||||
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
|
||||
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
|
||||
pmdp_test_and_clear_young(vma, addr, pmd + i);
|
||||
goto next;
|
||||
}
|
||||
@@ -3815,10 +3825,12 @@ restart:
|
||||
walk->mm_stats[MM_NONLEAF_TOTAL]++;
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
|
||||
- if (!pmd_young(val))
|
||||
- continue;
|
||||
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
|
||||
+ if (!pmd_young(val))
|
||||
+ continue;
|
||||
|
||||
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
||||
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
||||
+ }
|
||||
#endif
|
||||
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
|
||||
continue;
|
||||
@@ -4080,7 +4092,7 @@ static bool try_to_inc_max_seq(struct lr
|
||||
* handful of PTEs. Spreading the work out over a period of time usually
|
||||
* is less efficient, but it avoids bursty page faults.
|
||||
*/
|
||||
- if (!arch_has_hw_pte_young()) {
|
||||
+ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
|
||||
success = iterate_mm_list_nowalk(lruvec, max_seq);
|
||||
goto done;
|
||||
}
|
||||
@@ -4846,6 +4858,208 @@ done:
|
||||
}
|
||||
|
||||
/******************************************************************************
|
||||
+ * state change
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
|
||||
+{
|
||||
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+
|
||||
+ if (lrugen->enabled) {
|
||||
+ enum lru_list lru;
|
||||
+
|
||||
+ for_each_evictable_lru(lru) {
|
||||
+ if (!list_empty(&lruvec->lists[lru]))
|
||||
+ return false;
|
||||
+ }
|
||||
+ } else {
|
||||
+ int gen, type, zone;
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone) {
|
||||
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool fill_evictable(struct lruvec *lruvec)
|
||||
+{
|
||||
+ enum lru_list lru;
|
||||
+ int remaining = MAX_LRU_BATCH;
|
||||
+
|
||||
+ for_each_evictable_lru(lru) {
|
||||
+ int type = is_file_lru(lru);
|
||||
+ bool active = is_active_lru(lru);
|
||||
+ struct list_head *head = &lruvec->lists[lru];
|
||||
+
|
||||
+ while (!list_empty(head)) {
|
||||
+ bool success;
|
||||
+ struct page *page = lru_to_page(head);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page);
|
||||
+
|
||||
+ del_page_from_lru_list(page, lruvec);
|
||||
+ success = lru_gen_add_page(lruvec, page, false);
|
||||
+ VM_WARN_ON_ONCE(!success);
|
||||
+
|
||||
+ if (!--remaining)
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool drain_evictable(struct lruvec *lruvec)
|
||||
+{
|
||||
+ int gen, type, zone;
|
||||
+ int remaining = MAX_LRU_BATCH;
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone) {
|
||||
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
|
||||
+
|
||||
+ while (!list_empty(head)) {
|
||||
+ bool success;
|
||||
+ struct page *page = lru_to_page(head);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
|
||||
+
|
||||
+ success = lru_gen_del_page(lruvec, page, false);
|
||||
+ VM_WARN_ON_ONCE(!success);
|
||||
+ add_page_to_lru_list(page, lruvec);
|
||||
+
|
||||
+ if (!--remaining)
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_change_state(bool enabled)
|
||||
+{
|
||||
+ static DEFINE_MUTEX(state_mutex);
|
||||
+
|
||||
+ struct mem_cgroup *memcg;
|
||||
+
|
||||
+ cgroup_lock();
|
||||
+ cpus_read_lock();
|
||||
+ get_online_mems();
|
||||
+ mutex_lock(&state_mutex);
|
||||
+
|
||||
+ if (enabled == lru_gen_enabled())
|
||||
+ goto unlock;
|
||||
+
|
||||
+ if (enabled)
|
||||
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
|
||||
+ else
|
||||
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
|
||||
+
|
||||
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
||||
+ do {
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
||||
+
|
||||
+ if (!lruvec)
|
||||
+ continue;
|
||||
+
|
||||
+ spin_lock_irq(&lruvec->lru_lock);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
|
||||
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
|
||||
+
|
||||
+ lruvec->lrugen.enabled = enabled;
|
||||
+
|
||||
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
|
||||
+ spin_unlock_irq(&lruvec->lru_lock);
|
||||
+ cond_resched();
|
||||
+ spin_lock_irq(&lruvec->lru_lock);
|
||||
+ }
|
||||
+
|
||||
+ spin_unlock_irq(&lruvec->lru_lock);
|
||||
+ }
|
||||
+
|
||||
+ cond_resched();
|
||||
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
||||
+unlock:
|
||||
+ mutex_unlock(&state_mutex);
|
||||
+ put_online_mems();
|
||||
+ cpus_read_unlock();
|
||||
+ cgroup_unlock();
|
||||
+}
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * sysfs interface
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
+{
|
||||
+ unsigned int caps = 0;
|
||||
+
|
||||
+ if (get_cap(LRU_GEN_CORE))
|
||||
+ caps |= BIT(LRU_GEN_CORE);
|
||||
+
|
||||
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
|
||||
+ caps |= BIT(LRU_GEN_MM_WALK);
|
||||
+
|
||||
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
|
||||
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
|
||||
+
|
||||
+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
|
||||
+}
|
||||
+
|
||||
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
+ const char *buf, size_t len)
|
||||
+{
|
||||
+ int i;
|
||||
+ unsigned int caps;
|
||||
+
|
||||
+ if (tolower(*buf) == 'n')
|
||||
+ caps = 0;
|
||||
+ else if (tolower(*buf) == 'y')
|
||||
+ caps = -1;
|
||||
+ else if (kstrtouint(buf, 0, &caps))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
|
||||
+ bool enabled = caps & BIT(i);
|
||||
+
|
||||
+ if (i == LRU_GEN_CORE)
|
||||
+ lru_gen_change_state(enabled);
|
||||
+ else if (enabled)
|
||||
+ static_branch_enable(&lru_gen_caps[i]);
|
||||
+ else
|
||||
+ static_branch_disable(&lru_gen_caps[i]);
|
||||
+ }
|
||||
+
|
||||
+ return len;
|
||||
+}
|
||||
+
|
||||
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
|
||||
+ enabled, 0644, show_enabled, store_enabled
|
||||
+);
|
||||
+
|
||||
+static struct attribute *lru_gen_attrs[] = {
|
||||
+ &lru_gen_enabled_attr.attr,
|
||||
+ NULL
|
||||
+};
|
||||
+
|
||||
+static struct attribute_group lru_gen_attr_group = {
|
||||
+ .name = "lru_gen",
|
||||
+ .attrs = lru_gen_attrs,
|
||||
+};
|
||||
+
|
||||
+/******************************************************************************
|
||||
* initialization
|
||||
******************************************************************************/
|
||||
|
||||
@@ -4855,6 +5069,7 @@ void lru_gen_init_lruvec(struct lruvec *
|
||||
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
|
||||
lrugen->max_seq = MIN_NR_GENS + 1;
|
||||
+ lrugen->enabled = lru_gen_enabled();
|
||||
|
||||
for_each_gen_type_zone(gen, type, zone)
|
||||
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
||||
@@ -4894,6 +5109,9 @@ static int __init init_lru_gen(void)
|
||||
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
|
||||
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
||||
|
||||
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
|
||||
+ pr_err("lru_gen: failed to create sysfs group\n");
|
||||
+
|
||||
return 0;
|
||||
};
|
||||
late_initcall(init_lru_gen);
|
@ -0,0 +1,226 @@
From 73d1ff551760f0c79c47ab70faa4c2ca91413f5c Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:08 -0600
Subject: [PATCH 11/29] mm: multi-gen LRU: thrashing prevention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
requested by many desktop users [1].

When set to value N, it prevents the working set of N milliseconds from
getting evicted. The OOM killer is triggered if this working set cannot
be kept in memory. Based on the average human detectable lag (~100ms),
N=1000 usually eliminates intolerable lags due to thrashing. Larger
values like N=3000 make lags less noticeable at the risk of premature OOM
kills.

Compared with the size-based approach [2], this time-based approach
has the following advantages:

1. It is easier to configure because it is agnostic to applications
and memory sizes.
2. It is more reliable because it is directly wired to the OOM killer.
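
A usage sketch (the value is illustrative; min_ttl_ms is the sysfs file
added by this patch, and a read reports the stored value rounded through
jiffies):
echo 1000 >/sys/kernel/mm/lru_gen/min_ttl_ms
cat /sys/kernel/mm/lru_gen/min_ttl_ms
1000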

[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/
[2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/

Link: https://lkml.kernel.org/r/20220918080010.2920238-12-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mmzone.h | 2 ++
mm/vmscan.c | 74 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 73 insertions(+), 3 deletions(-)

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -399,6 +399,8 @@ struct lru_gen_struct {
unsigned long max_seq;
/* the eviction increments the oldest generation numbers */
unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
/* the multi-gen LRU lists, lazily sorted on eviction */
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the multi-gen LRU sizes, eventually consistent */
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4064,6 +4064,7 @@ static void inc_max_seq(struct lruvec *l
for (type = 0; type < ANON_AND_FILE; type++)
reset_ctrl_pos(lruvec, type, false);

+ WRITE_ONCE(lrugen->timestamps[next], jiffies);
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);

@@ -4193,7 +4194,7 @@ static bool should_run_aging(struct lruv
return false;
}

-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
{
bool need_aging;
unsigned long nr_to_scan;
@@ -4207,16 +4208,36 @@ static void age_lruvec(struct lruvec *lr
mem_cgroup_calculate_protection(NULL, memcg);

if (mem_cgroup_below_min(memcg))
- return;
+ return false;

need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+
+ if (min_ttl) {
+ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ if (time_is_after_jiffies(birth + min_ttl))
+ return false;
+
+ /* the size is likely too small to be helpful */
+ if (!nr_to_scan && sc->priority != DEF_PRIORITY)
+ return false;
+ }
+
if (need_aging)
try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+
+ return true;
}

+/* to protect the working set of the last N jiffies */
+static unsigned long lru_gen_min_ttl __read_mostly;
+
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
+ bool success = false;
+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);

VM_WARN_ON_ONCE(!current_is_kswapd());

@@ -4239,12 +4260,32 @@ static void lru_gen_age_node(struct pgli
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

- age_lruvec(lruvec, sc);
+ if (age_lruvec(lruvec, sc, min_ttl))
+ success = true;

cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));

clear_mm_walk();
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (success || !min_ttl || sc->order)
+ return;
+
+ /*
+ * The main goal is to OOM kill if every generation from all memcgs is
+ * younger than min_ttl. However, another possibility is all memcgs are
+ * either below min or empty.
+ */
+ if (mutex_trylock(&oom_lock)) {
+ struct oom_control oc = {
+ .gfp_mask = sc->gfp_mask,
+ };
+
+ out_of_memory(&oc);
+
+ mutex_unlock(&oom_lock);
+ }
}

/*
@@ -5002,6 +5043,28 @@ unlock:
 * sysfs interface
 ******************************************************************************/

+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
unsigned int caps = 0;
@@ -5050,6 +5113,7 @@ static struct kobj_attribute lru_gen_ena
);

static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
&lru_gen_enabled_attr.attr,
NULL
};
@@ -5065,12 +5129,16 @@ static struct attribute_group lru_gen_at

void lru_gen_init_lruvec(struct lruvec *lruvec)
{
+ int i;
int gen, type, zone;
struct lru_gen_struct *lrugen = &lruvec->lrugen;

lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();

+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);

@ -0,0 +1,579 @@
From 530716d008ca26315f246cd70dc1cefc636beaa4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:09 -0600
Subject: [PATCH 12/29] mm: multi-gen LRU: debugfs interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add /sys/kernel/debug/lru_gen for working set estimation and proactive
reclaim. These techniques are commonly used to optimize job scheduling
(bin packing) in data centers [1][2].

Compared with the page table-based approach and the PFN-based
approach, this lruvec-based approach has the following advantages:
1. It offers better choices because it is aware of memcgs, NUMA nodes,
shared mappings and unmapped page cache.
2. It is more scalable because it is O(nr_hot_pages), whereas the
PFN-based approach is O(nr_total_pages).

Add /sys/kernel/debug/lru_gen_full for debugging.
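
A hedged sketch of the command format parsed by lru_gen_seq_write() below;
'+' runs aging and '-' runs eviction, and the trailing swappiness and
nr_to_reclaim fields are optional (all IDs and sequence numbers here are
illustrative):
echo '+ 0 0 4' >/sys/kernel/debug/lru_gen
echo '- 0 0 2 200 100' >/sys/kernel/debug/lru_gen
cat /sys/kernel/debug/lru_gen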

[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731

Link: https://lkml.kernel.org/r/20220918080010.2920238-13-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/nodemask.h | 1 +
mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 402 insertions(+), 10 deletions(-)

--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -485,6 +485,7 @@ static inline int num_node_state(enum no
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,6 +53,7 @@
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
+#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -3968,12 +3969,40 @@ static void clear_mm_walk(void)
kfree(walk);
}

-static void inc_min_seq(struct lruvec *lruvec, int type)
+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
+ int zone;
+ int remaining = MAX_LRU_BATCH;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (type == LRU_GEN_ANON && !can_swap)
+ goto done;
+
+ /* prevent cold/hot inversion if force_scan is true */
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct list_head *head = &lrugen->lists[old_gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct page *page = lru_to_page(head);
+
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
+ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
+
+ new_gen = page_inc_gen(lruvec, page, false);
+ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+done:
reset_ctrl_pos(lruvec, type, true);
WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
}

static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
@@ -4019,7 +4048,7 @@ next:
return success;
}

-static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
{
int prev, next;
int type, zone;
@@ -4033,9 +4062,13 @@ static void inc_max_seq(struct lruvec *l
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;

- VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));

- inc_min_seq(lruvec, type);
+ while (!inc_min_seq(lruvec, type, can_swap)) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
}

/*
@@ -4072,7 +4105,7 @@ static void inc_max_seq(struct lruvec *l
}

static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap)
+ struct scan_control *sc, bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -4093,7 +4126,7 @@ static bool try_to_inc_max_seq(struct lr
 * handful of PTEs. Spreading the work out over a period of time usually
 * is less efficient, but it avoids bursty page faults.
 */
- if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -4107,7 +4140,7 @@ static bool try_to_inc_max_seq(struct lr
walk->lruvec = lruvec;
walk->max_seq = max_seq;
walk->can_swap = can_swap;
- walk->force_scan = false;
+ walk->force_scan = force_scan;

do {
success = iterate_mm_list(lruvec, walk, &mm);
@@ -4127,7 +4160,7 @@ done:

VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));

- inc_max_seq(lruvec, can_swap);
+ inc_max_seq(lruvec, can_swap, force_scan);
/* either this sees any waiters or they will see updated max_seq */
if (wq_has_sleeper(&lruvec->mm_state.wait))
wake_up_all(&lruvec->mm_state.wait);
@@ -4225,7 +4258,7 @@ static bool age_lruvec(struct lruvec *lr
}

if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);

return true;
}
@@ -4784,7 +4817,7 @@ static unsigned long get_nr_to_scan(stru
if (current_is_kswapd())
return 0;

- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
return nr_to_scan;
done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
@@ -5124,6 +5157,361 @@ static struct attribute_group lru_gen_at
};

/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return get_lruvec(memcg, nid);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(memcg, nid);
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = lru_hist_from_seq(seq);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ const char *s = "   ";
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ s = "RT ";
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+ s = "rep";
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ }
+
+ for (i = 0; i < 3; i++)
+ seq_printf(m, " %10lu%c", n[i], s[i]);
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ const char *s = "      ";
+ unsigned long n = 0;
+
+ if (seq == max_seq && NR_HIST_GENS == 1) {
+ s = "LOYNFA";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ } else if (seq != max_seq && NR_HIST_GENS > 1) {
+ s = "loynfa";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ }
+
+ seq_printf(m, " %10lu%c", n, s[i]);
+ }
+ seq_putc(m, '\n');
+}
+
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ if (!full)
+ seq = min_seq[LRU_GEN_ANON];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int type, zone;
+ int gen = lru_gen_from_seq(seq);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long size = 0;
+ char mark = full && seq < min_seq[type] ? 'x' : ' ';
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ seq_printf(m, " %10lu%c", size, mark);
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ bool can_swap, bool force_scan)
+{
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < max_seq)
+ return 0;
+
+ if (seq > max_seq)
+ return -EINVAL;
+
+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
+ return -ERANGE;
+
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+
+ return 0;
+}
+
+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ int swappiness, unsigned long nr_to_reclaim)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq + MIN_NR_GENS > max_seq)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < min_seq[!swappiness])
+ return 0;
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return 0;
+
+ if (!evict_pages(lruvec, sc, swappiness, NULL))
+ return 0;
+
+ cond_resched();
+ }
+
+ return -EINTR;
+}
+
+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+ struct scan_control *sc, int swappiness, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ return -EINVAL;
+ }
+
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ lruvec = get_lruvec(memcg, nid);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (swappiness > 200)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, seq, sc, swappiness, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, seq, sc, swappiness, opt);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ struct blk_plug plug;
+ int err = -EINVAL;
+ struct scan_control sc = {
+ .may_writepage = true,
+ .may_unmap = true,
+ .may_swap = true,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ flags = memalloc_noreclaim_save();
+ blk_start_plug(&plug);
+ if (!set_mm_walk(NULL)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
+ if (err)
+ break;
+ }
+done:
+ clear_mm_walk();
+ blk_finish_plug(&plug);
+ memalloc_noreclaim_restore(flags);
+ set_task_reclaim_state(current, NULL);
+
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/******************************************************************************
 * initialization
 ******************************************************************************/

@@ -5180,6 +5568,9 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");

+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
return 0;
};
late_initcall(init_lru_gen);
@ -0,0 +1,32 @@
From 92d430e8955c976eacb7cc91d7ff849c0dd009af Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 28 Sep 2022 13:36:58 -0600
Subject: [PATCH 13/29] mm/mglru: don't sync disk for each aging cycle

wakeup_flusher_threads() was added under the assumption that if a system
runs out of clean cold pages, it might want to write back dirty pages more
aggressively so that they can become clean and be dropped.

However, doing so can breach the rate limit a system wants to impose on
writeback, resulting in early SSD wearout.

Link: https://lkml.kernel.org/r/YzSiWq9UEER5LKup@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: Axel Rasmussen <axelrasmussen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 2 --
1 file changed, 2 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4165,8 +4165,6 @@ done:
if (wq_has_sleeper(&lruvec->mm_state.wait))
wake_up_all(&lruvec->mm_state.wait);

- wakeup_flusher_threads(WB_REASON_VMSCAN);
-
return true;
}

@ -0,0 +1,124 @@
From 6f315879ad750391a0b1fab8c9170bc054a5f5d7 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Tue, 15 Nov 2022 18:38:07 -0700
Subject: [PATCH 14/29] mm: multi-gen LRU: retry pages written back while
 isolated

The page reclaim isolates a batch of pages from the tail of one of the
LRU lists and works on those pages one by one. For a suitable
swap-backed page, if the swap device is async, it queues that page for
writeback. After the page reclaim finishes an entire batch, it puts back
the pages it queued for writeback to the head of the original LRU list.

In the meantime, the page writeback flushes the queued pages also by
batches. Its batching logic is independent from that of the page reclaim.
For each of the pages it writes back, the page writeback calls
rotate_reclaimable_page() which tries to rotate a page to the tail.

rotate_reclaimable_page() only works for a page after the page reclaim
has put it back. If an async swap device is fast enough, the page
writeback can finish with that page while the page reclaim is still
working on the rest of the batch containing it. In this case, that page
will remain at the head and the page reclaim will not retry it before
reaching there.

This patch adds a retry to evict_pages(). After evict_pages() has
finished an entire batch and before it puts back pages it cannot free
immediately, it retries those that may have missed the rotation.

Before this patch, ~60% of pages swapped to an Intel Optane missed
rotate_reclaimable_page(). After this patch, ~99% of missed pages were
reclaimed upon retry.

This problem affects relatively slow async swap devices like Samsung 980
Pro much less and does not affect sync swap devices like zram or zswap at
all.

Link: https://lkml.kernel.org/r/20221116013808.3995280-1-yuzhao@google.com
Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: "Yin, Fengwei" <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 48 +++++++++++++++++++++++++++++++++++++-----------
1 file changed, 37 insertions(+), 11 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4723,10 +4723,13 @@ static int evict_pages(struct lruvec *lr
int scanned;
int reclaimed;
LIST_HEAD(list);
+ LIST_HEAD(clean);
struct page *page;
+ struct page *next;
enum vm_event_item item;
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
+ bool skip_retry = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);

@@ -4743,20 +4746,37 @@ static int evict_pages(struct lruvec *lr

if (list_empty(&list))
return scanned;
-
+retry:
reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
+ sc->nr_reclaimed += reclaimed;

- list_for_each_entry(page, &list, lru) {
- /* restore LRU_REFS_FLAGS cleared by isolate_page() */
- if (PageWorkingset(page))
- SetPageReferenced(page);
+ list_for_each_entry_safe_reverse(page, next, &list, lru) {
+ if (!page_evictable(page)) {
+ list_del(&page->lru);
+ putback_lru_page(page);
+ continue;
+ }

- /* don't add rejected pages to the oldest generation */
if (PageReclaim(page) &&
- (PageDirty(page) || PageWriteback(page)))
- ClearPageActive(page);
- else
- SetPageActive(page);
+ (PageDirty(page) || PageWriteback(page))) {
+ /* restore LRU_REFS_FLAGS cleared by isolate_page() */
+ if (PageWorkingset(page))
+ SetPageReferenced(page);
+ continue;
+ }
+
+ if (skip_retry || PageActive(page) || PageReferenced(page) ||
+ page_mapped(page) || PageLocked(page) ||
+ PageDirty(page) || PageWriteback(page)) {
+ /* don't add rejected pages to the oldest generation */
+ set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
+ BIT(PG_active));
+ continue;
+ }
+
+ /* retry pages that may have missed rotate_reclaimable_page() */
+ list_move(&page->lru, &clean);
+ sc->nr_scanned -= thp_nr_pages(page);
}

spin_lock_irq(&lruvec->lru_lock);
@@ -4778,7 +4798,13 @@ static int evict_pages(struct lruvec *lr
mem_cgroup_uncharge_list(&list);
free_unref_page_list(&list);

- sc->nr_reclaimed += reclaimed;
+ INIT_LIST_HEAD(&list);
+ list_splice_init(&clean, &list);
+
+ if (!list_empty(&list)) {
+ skip_retry = true;
+ goto retry;
+ }

if (need_swapping && type == LRU_GEN_ANON)
*need_swapping = true;
@ -0,0 +1,49 @@
From 255bb0ac393f1c2818cd75af45a9226300ab3daf Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 26 Oct 2022 15:48:30 +0200
Subject: [PATCH 15/29] mm: multi-gen LRU: move lru_gen_add_mm() out of IRQ-off
 region

lru_gen_add_mm() has been added within an IRQ-off region in the commit
mentioned below. The other invocations of lru_gen_add_mm() are not within
an IRQ-off region.

The invocation within IRQ-off region is problematic on PREEMPT_RT because
the function is using a spin_lock_t which must not be used within
IRQ-disabled regions.

The other invocations of lru_gen_add_mm() occur while
task_struct::alloc_lock is acquired. Move lru_gen_add_mm() after
interrupts are enabled and before task_unlock().

Link: https://lkml.kernel.org/r/20221026134830.711887-1-bigeasy@linutronix.de
Fixes: bd74fdaea1460 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/exec.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1013,7 +1013,6 @@ static int exec_mmap(struct mm_struct *m
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
- lru_gen_add_mm(mm);
/*
 * This prevents preemption while active_mm is being loaded and
 * it and mm are being updated, which could cause problems for
@@ -1028,6 +1027,7 @@ static int exec_mmap(struct mm_struct *m
local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
+ lru_gen_add_mm(mm);
task_unlock(tsk);
lru_gen_use_mm(mm);
if (old_mm) {
@ -0,0 +1,96 @@
From c5ec455ebd2b488d91de9d8915a0c8036a2a04dd Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 30 Nov 2022 14:49:41 -0800
Subject: [PATCH 17/29] mm: add dummy pmd_young() for architectures not having
 it

In order to avoid #ifdeffery add a dummy pmd_young() implementation as a
fallback. This is required for the later patch "mm: introduce
arch_has_hw_nonleaf_pmd_young()".

Link: https://lkml.kernel.org/r/fd3ac3cd-7349-6bbd-890a-71a9454ca0b3@suse.com
Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Sander Eikelenboom <linux@eikelenboom.it>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
arch/mips/include/asm/pgtable.h | 1 +
arch/riscv/include/asm/pgtable.h | 1 +
arch/s390/include/asm/pgtable.h | 1 +
arch/sparc/include/asm/pgtable_64.h | 1 +
arch/x86/include/asm/pgtable.h | 1 +
include/linux/pgtable.h | 7 +++++++
6 files changed, 12 insertions(+)

--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -632,6 +632,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pm
return pmd;
}

+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return !!(pmd_val(pmd) & _PAGE_ACCESSED);
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -535,6 +535,7 @@ static inline int pmd_dirty(pmd_t pmd)
return pte_dirty(pmd_pte(pmd));
}

+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return pte_young(pmd_pte(pmd));
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -748,6 +748,7 @@ static inline int pmd_dirty(pmd_t pmd)
return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0;
}

+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0;
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -712,6 +712,7 @@ static inline unsigned long pmd_dirty(pm
return pte_dirty(pte);
}

+#define pmd_young pmd_young
static inline unsigned long pmd_young(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -136,6 +136,7 @@ static inline int pmd_dirty(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_DIRTY;
}

+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_ACCESSED;
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -164,6 +164,13 @@ static inline pte_t *virt_to_kpte(unsign
return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
}

+#ifndef pmd_young
+static inline int pmd_young(pmd_t pmd)
+{
+ return 0;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
@ -0,0 +1,113 @@
From 46cbda7b65998a5af4493f745d94417af697bd68 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 23 Nov 2022 07:45:10 +0100
Subject: [PATCH 18/29] mm: introduce arch_has_hw_nonleaf_pmd_young()

When running as a Xen PV guest, commit eed9a328aa1a ("mm: x86: add
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation in
pmdp_test_and_clear_young():

BUG: unable to handle page fault for address: ffff8880083374d0
#PF: supervisor write access in kernel mode
#PF: error_code(0x0003) - permissions violation
PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
Oops: 0003 [#1] PREEMPT SMP NOPTI
CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
RIP: e030:pmdp_test_and_clear_young+0x25/0x40

This happens because the Xen hypervisor can't emulate direct writes to
page table entries other than PTEs.

This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young()
similar to arch_has_hw_pte_young() and test that instead of
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.

Link: https://lkml.kernel.org/r/20221123064510.16225-1-jgross@suse.com
Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
Signed-off-by: Juergen Gross <jgross@suse.com>
Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Acked-by: Yu Zhao <yuzhao@google.com>
Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
Acked-by: David Hildenbrand <david@redhat.com> [core changes]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
arch/x86/include/asm/pgtable.h | 8 ++++++++
include/linux/pgtable.h | 11 +++++++++++
mm/vmscan.c | 10 +++++-----
3 files changed, 24 insertions(+), 5 deletions(-)

--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1405,6 +1405,14 @@ static inline bool arch_has_hw_pte_young
return true;
}

+#ifdef CONFIG_XEN_PV
+#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+ return !cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+#endif
+
#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_H */
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -266,6 +266,17 @@ static inline int pmdp_clear_flush_young
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

+#ifndef arch_has_hw_nonleaf_pmd_young
+/*
+ * Return whether the accessed bit in non-leaf PMD entries is supported on the
+ * local CPU.
+ */
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+ return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
+}
+#endif
+
#ifndef arch_has_hw_pte_young
/*
 * Return whether the accessed bit is supported on the local CPU.
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3727,7 +3727,7 @@ static void walk_pmd_range_locked(pud_t
goto next;

if (!pmd_trans_huge(pmd[i])) {
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ if (arch_has_hw_nonleaf_pmd_young() &&
get_cap(LRU_GEN_NONLEAF_YOUNG))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
@@ -3825,14 +3825,14 @@ restart:
#endif
walk->mm_stats[MM_NONLEAF_TOTAL]++;

-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (arch_has_hw_nonleaf_pmd_young() &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG)) {
if (!pmd_young(val))
continue;

walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
}
-#endif
+
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
continue;

@@ -5132,7 +5132,7 @@ static ssize_t show_enabled(struct kobje
if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
caps |= BIT(LRU_GEN_MM_WALK);

- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
caps |= BIT(LRU_GEN_NONLEAF_YOUNG);

return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
@ -0,0 +1,56 @@
From c7dfefd4bdfba3d5171038d1cc2d4160288e6ee4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 15 Jan 2023 20:44:05 -0700
Subject: [PATCH 16/29] mm: multi-gen LRU: fix crash during cgroup migration

lru_gen_migrate_mm() assumes lru_gen_add_mm() runs prior to itself. This
isn't true for the following scenario:

    CPU 1                          CPU 2

    clone()
      cgroup_can_fork()
                                   cgroup_procs_write()
      cgroup_post_fork()
                                   task_lock()
                                   lru_gen_migrate_mm()
                                   task_unlock()
    task_lock()
    lru_gen_add_mm()
    task_unlock()

And when the above happens, the kernel crashes because of linked list
corruption (mm_struct->lru_gen.list).
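
The fix below makes that ordering explicit: lru_gen_migrate_mm() now
bails out when lru_gen_add_mm() has not run yet, i.e., when the mm has
no memcg attached (sketch lifted from the hunk that follows):

    /* migration can happen before addition */
    if (!mm->lru_gen.memcg)
        return;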

Link: https://lore.kernel.org/r/20230115134651.30028-1-msizanoen@qtmlabs.xyz/
Link: https://lkml.kernel.org/r/20230116034405.2960276-1-yuzhao@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: msizanoen <msizanoen@qtmlabs.xyz>
Tested-by: msizanoen <msizanoen@qtmlabs.xyz>
Cc: <stable@vger.kernel.org> [6.1+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3024,13 +3024,16 @@ void lru_gen_migrate_mm(struct mm_struct
if (mem_cgroup_disabled())
return;

+ /* migration can happen before addition */
+ if (!mm->lru_gen.memcg)
+ return;
+
rcu_read_lock();
memcg = mem_cgroup_from_task(task);
rcu_read_unlock();
if (memcg == mm->lru_gen.memcg)
return;

- VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));

lru_gen_del_mm(mm);
@ -0,0 +1,196 @@
From 6c7f552a48b49a8612786a28a2239fbc24fac289 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Fri, 30 Dec 2022 14:52:51 -0700
Subject: [PATCH 19/29] mm: add vma_has_recency()

Add vma_has_recency() to indicate whether a VMA may exhibit temporal
locality that the LRU algorithm relies on.

This function returns false for VMAs marked by VM_SEQ_READ or
VM_RAND_READ. While the former flag indicates linear access, i.e., a
special case of spatial locality, both flags indicate a lack of temporal
locality, i.e., the reuse of an area within a relatively small duration.

"Recency" is chosen over "locality" to avoid confusion between temporal
and spatial localities.

Before this patch, the active/inactive LRU only ignored the accessed bit
from VMAs marked by VM_SEQ_READ. After this patch, the active/inactive
LRU and MGLRU share the same logic: they both ignore the accessed bit if
vma_has_recency() returns false.
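
For reference, the predicate itself is deliberately tiny; in sketch form
(matching the include/linux/mm_inline.h hunk below) it is just a flag
test:

    static inline bool vma_has_recency(struct vm_area_struct *vma)
    {
        if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
            return false;

        return true;
    }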

For the active/inactive LRU, the following fio test showed a [6, 8]%
increase in IOPS when randomly accessing mapped files under memory
pressure.

kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo)
kb=$((kb - 8*1024*1024))

modprobe brd rd_nr=1 rd_size=$kb
dd if=/dev/zero of=/dev/ram0 bs=1M

mkfs.ext4 /dev/ram0
mount /dev/ram0 /mnt/
swapoff -a

fio --name=test --directory=/mnt/ --ioengine=mmap --numjobs=8 \
--size=8G --rw=randrw --time_based --runtime=10m \
--group_reporting

The discussion that led to this patch is here [1]. Additional test
results are available in that thread.

[1] https://lore.kernel.org/r/Y31s%2FK8T85jh05wH@google.com/

Link: https://lkml.kernel.org/r/20221230215252.2628425-1-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mm_inline.h | 9 +++++++++
mm/memory.c | 8 ++++----
mm/rmap.c | 42 +++++++++++++++++----------------------
mm/vmscan.c | 5 ++++-
4 files changed, 35 insertions(+), 29 deletions(-)

--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -333,4 +333,13 @@ static __always_inline void del_page_fro
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-thp_nr_pages(page));
}
+
+static inline bool vma_has_recency(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
+ return false;
+
+ return true;
+}
+
#endif
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -41,6 +41,7 @@

#include <linux/kernel_stat.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
@@ -1353,8 +1354,7 @@ again:
force_flush = 1;
set_page_dirty(page);
}
- if (pte_young(ptent) &&
- likely(!(vma->vm_flags & VM_SEQ_READ)))
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
@@ -4795,8 +4795,8 @@ static inline void mm_account_fault(stru
#ifdef CONFIG_LRU_GEN
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
- /* the LRU algorithm doesn't apply to sequential or random reads */
- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+ /* the LRU algorithm only applies to accesses with recency */
+ current->in_lru_fault = vma_has_recency(vma);
}

static void lru_gen_exit_fault(void)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -794,25 +794,14 @@ static bool page_referenced_one(struct p
}

if (pvmw.pte) {
- if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
lru_gen_look_around(&pvmw);
referenced++;
}

if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- /*
- * Don't treat a reference through
- * a sequentially read mapping as such.
- * If the page has been used in another mapping,
- * we will catch it; if this other mapping is
- * already gone, the unmap path will have set
- * PG_referenced or activated the page.
- */
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
- referenced++;
- }
+ pvmw.pte))
+ referenced++;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
@@ -846,7 +835,20 @@ static bool invalid_page_referenced_vma(
struct page_referenced_arg *pra = arg;
struct mem_cgroup *memcg = pra->memcg;

- if (!mm_match_cgroup(vma->vm_mm, memcg))
+ /*
+ * Ignore references from this mapping if it has no recency. If the
+ * page has been used in another mapping, we will catch it; if this
+ * other mapping is already gone, the unmap path will have set the
+ * referenced flag or activated the page in zap_pte_range().
+ */
+ if (!vma_has_recency(vma))
+ return true;
+
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip counting on behalf
+ * of references from different cgroups.
+ */
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
return true;

return false;
@@ -876,6 +878,7 @@ int page_referenced(struct page *page,
.rmap_one = page_referenced_one,
.arg = (void *)&pra,
.anon_lock = page_lock_anon_vma_read,
+ .invalid_vma = invalid_page_referenced_vma,
};

*vm_flags = 0;
@@ -891,15 +894,6 @@ int page_referenced(struct page *page,
return 1;
}

- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg) {
- rwc.invalid_vma = invalid_page_referenced_vma;
- }
-
rmap_walk(page, &rwc);
*vm_flags = pra.vm_flags;

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3486,7 +3486,10 @@ static int should_skip_vma(unsigned long
if (is_vm_hugetlb_page(vma))
return true;

- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
+ if (!vma_has_recency(vma))
+ return true;
+
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
return true;

if (vma == get_gate_vma(vma->vm_mm))
@ -0,0 +1,125 @@
From 686c3d4f71de9e0e7a27f03a5617a712385f90cd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Fri, 30 Dec 2022 14:52:52 -0700
Subject: [PATCH 20/29] mm: support POSIX_FADV_NOREUSE

This patch adds POSIX_FADV_NOREUSE to vma_has_recency() so that the LRU
algorithm can ignore access to mapped files marked by this flag.

The advantages of POSIX_FADV_NOREUSE are:
1. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not alter the
default readahead behavior.
2. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not split VMAs and
therefore does not take mmap_lock.
3. Unlike MADV_COLD, setting it has a negligible cost, regardless of
how many pages it affects.

Its limitations are:
1. Like POSIX_FADV_RANDOM and POSIX_FADV_SEQUENTIAL, it currently does
not support range. IOW, its scope is the entire file.
2. It currently does not ignore access through file descriptors.
Specifically, for the active/inactive LRU, given a file page shared
by two users and one of them having set POSIX_FADV_NOREUSE on the
file, this page will be activated upon the second user accessing
it. This corner case can be covered by checking POSIX_FADV_NOREUSE
before calling mark_page_accessed() on the read path. But it is
considered not worth the effort.

There have been a few attempts to support POSIX_FADV_NOREUSE, e.g., [1].
This time the goal is to fill a niche: a few desktop applications, e.g.,
large file transferring and video encoding/decoding, want fast file
streaming with mmap() rather than direct IO. Among those applications, an
SVT-AV1 regression was reported when running with MGLRU [2]. The
following test can reproduce that regression.

kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo)
kb=$((kb - 8*1024*1024))

modprobe brd rd_nr=1 rd_size=$kb
dd if=/dev/zero of=/dev/ram0 bs=1M

mkfs.ext4 /dev/ram0
mount /dev/ram0 /mnt/
swapoff -a

fallocate -l 8G /mnt/swapfile
mkswap /mnt/swapfile
swapon /mnt/swapfile

wget http://ultravideo.cs.tut.fi/video/Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z
7z e -o/mnt/ Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z
SvtAv1EncApp --preset 12 -w 3840 -h 2160 \
-i /mnt/Bosphorus_3840x2160.y4m

For MGLRU, the following change showed a [9-11]% increase in FPS,
which makes it on par with the active/inactive LRU.

patch Source/App/EncApp/EbAppMain.c <<EOF
31a32
> #include <fcntl.h>
35d35
< #include <fcntl.h> /* _O_BINARY */
117a118
> posix_fadvise(config->mmap.fd, 0, 0, POSIX_FADV_NOREUSE);
EOF
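
In the same vein, a minimal standalone sketch of how an application opts
in (hypothetical input path; error handling omitted; posix_fadvise() and
POSIX_FADV_NOREUSE are the standard <fcntl.h> interfaces used in the
snippet above):

    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
        struct stat st;
        int fd = open("/mnt/input.y4m", O_RDONLY);   /* hypothetical path */

        fstat(fd, &st);
        /* whole-file scope: expect no reuse, but keep readahead unchanged */
        posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);

        void *p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        /* ... stream through p ... */
        munmap(p, st.st_size);
        close(fd);
        return 0;
    }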

[1] https://lore.kernel.org/r/1308923350-7932-1-git-send-email-andrea@betterlinux.com/
[2] https://openbenchmarking.org/result/2209259-PTS-MGLRU8GB57

Link: https://lkml.kernel.org/r/20221230215252.2628425-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/fs.h | 2 ++
include/linux/mm_inline.h | 3 +++
mm/fadvise.c | 5 ++++-
3 files changed, 9 insertions(+), 1 deletion(-)

--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -167,6 +167,8 @@ typedef int (dio_iodone_t)(struct kiocb
/* File is stream-like */
#define FMODE_STREAM ((__force fmode_t)0x200000)

+#define FMODE_NOREUSE ((__force fmode_t)0x400000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)

--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -339,6 +339,9 @@ static inline bool vma_has_recency(struc
if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
return false;

+ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE))
+ return false;
+
return true;
}

--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, l
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
spin_lock(&file->f_lock);
- file->f_mode &= ~FMODE_RANDOM;
+ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE);
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_RANDOM:
@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, l
force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
+ spin_lock(&file->f_lock);
+ file->f_mode |= FMODE_NOREUSE;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_DONTNEED:
if (!inode_write_congested(mapping->host))
@ -0,0 +1,348 @@
From 348fdbada9fb3f0bf1a53651be46319105af187f Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:18:59 -0700
Subject: [PATCH 21/29] mm: multi-gen LRU: rename lru_gen_struct to
 lru_gen_page

Patch series "mm: multi-gen LRU: memcg LRU", v3.

Overview
========

An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
since each node and memcg combination has an LRU of pages (see
mem_cgroup_lruvec()).

Its goal is to improve the scalability of global reclaim, which is
critical to system-wide memory overcommit in data centers. Note that
memcg reclaim is currently out of scope.

Its memory bloat is a pointer to each lruvec and negligible to each
pglist_data. In terms of traversing memcgs during global reclaim, it
improves the best-case complexity from O(n) to O(1) and does not affect
the worst-case complexity O(n). Therefore, on average, it has a sublinear
complexity in contrast to the current linear complexity.

The basic structure of an memcg LRU can be understood by an analogy to
the active/inactive LRU (of pages):
1. It has the young and the old (generations), i.e., the counterparts
to the active and the inactive;
2. The increment of max_seq triggers promotion, i.e., the counterpart
to activation;
3. Other events trigger similar operations, e.g., offlining an memcg
triggers demotion, i.e., the counterpart to deactivation.

In terms of global reclaim, it has two distinct features:
1. Sharding, which allows each thread to start at a random memcg (in
the old generation) and improves parallelism;
2. Eventual fairness, which allows direct reclaim to bail out at will
and reduces latency without affecting fairness over some time.

The commit message in patch 6 details the workflow:
https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/

The following is a simple test to quickly verify its effectiveness.

Test design:
1. Create multiple memcgs.
2. Each memcg contains a job (fio).
3. All jobs access the same amount of memory randomly.
4. The system does not experience global memory pressure.
5. Periodically write to the root memory.reclaim.

Desired outcome:
1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal)
over mean(pgsteal) is close to 0%.
2. The total pgsteal is close to the total requested through
memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close
to 100%.

Actual outcome [1]:

                                    MGLRU off    MGLRU on
  stddev(pgsteal) / mean(pgsteal)   75%          20%
  sum(pgsteal) / sum(requested)     425%         95%

####################################################################
MEMCGS=128

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
mkdir /sys/fs/cgroup/memcg$memcg
done

start() {
echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs

fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
--filename=/dev/zero --size=1920M --rw=randrw \
--rate=64m,64m --random_distribution=random \
--fadvise_hint=0 --time_based --runtime=10h \
--group_reporting --minimal
}

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
start &
done

sleep 600

for ((i = 0; i < 600; i++)); do
echo 256m >/sys/fs/cgroup/memory.reclaim
sleep 6
done

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
done
####################################################################

[1]: This was obtained from running the above script (touches less
than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an
hour.

This patch (of 8):

The new name lru_gen_page will be more distinct from the coming
lru_gen_memcg.

Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mm_inline.h | 4 ++--
include/linux/mmzone.h | 6 +++---
mm/vmscan.c | 34 +++++++++++++++++-----------------
mm/workingset.c | 4 ++--
4 files changed, 24 insertions(+), 24 deletions(-)

--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -168,7 +168,7 @@ static inline void lru_gen_update_size(s
int zone = page_zonenum(page);
int delta = thp_nr_pages(page);
enum lru_list lru = type * LRU_INACTIVE_FILE;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
@@ -214,7 +214,7 @@ static inline bool lru_gen_add_page(stru
int gen = page_lru_gen(page);
int type = page_is_file_lru(page);
int zone = page_zonenum(page);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

VM_WARN_ON_ONCE_PAGE(gen != -1, page);

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -394,7 +394,7 @@ enum {
* The number of pages in each generation is eventually consistent and therefore
* can be transiently negative when reset_batch_size() is pending.
*/
-struct lru_gen_struct {
+struct lru_gen_page {
/* the aging increments the youngest generation number */
unsigned long max_seq;
/* the eviction increments the oldest generation numbers */
@@ -451,7 +451,7 @@ struct lru_gen_mm_state {
struct lru_gen_mm_walk {
/* the lruvec under reclaim */
struct lruvec *lruvec;
- /* unstable max_seq from lru_gen_struct */
+ /* unstable max_seq from lru_gen_page */
unsigned long max_seq;
/* the next address within an mm to scan */
unsigned long next_addr;
@@ -514,7 +514,7 @@ struct lruvec {
unsigned long flags;
#ifdef CONFIG_LRU_GEN
/* evictable pages divided into generations */
- struct lru_gen_struct lrugen;
+ struct lru_gen_page lrugen;
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
#endif
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2910,7 +2910,7 @@ static int get_nr_gens(struct lruvec *lr

static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_struct */
+ /* see the comment on lru_gen_page */
return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
@@ -3316,7 +3316,7 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);

pos->refaulted = lrugen->avg_refaulted[type][tier] +
@@ -3331,7 +3331,7 @@ static void read_ctrl_pos(struct lruvec
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
{
int hist, tier;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;

@@ -3408,7 +3408,7 @@ static int page_update_gen(struct page *
static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
int type = page_is_file_lru(page);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
unsigned long new_flags, old_flags = READ_ONCE(page->flags);

@@ -3453,7 +3453,7 @@ static void update_batch_size(struct lru
static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
{
int gen, type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

walk->batched = 0;

@@ -3979,7 +3979,7 @@ static bool inc_min_seq(struct lruvec *l
{
int zone;
int remaining = MAX_LRU_BATCH;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);

if (type == LRU_GEN_ANON && !can_swap)
@@ -4015,7 +4015,7 @@ static bool try_to_inc_min_seq(struct lr
{
int gen, type, zone;
bool success = false;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);

VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -4036,7 +4036,7 @@ next:
;
}

- /* see the comment on lru_gen_struct */
+ /* see the comment on lru_gen_page */
if (can_swap) {
min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
@@ -4058,7 +4058,7 @@ static void inc_max_seq(struct lruvec *l
{
int prev, next;
int type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

spin_lock_irq(&lruvec->lru_lock);

@@ -4116,7 +4116,7 @@ static bool try_to_inc_max_seq(struct lr
bool success;
struct lru_gen_mm_walk *walk;
struct mm_struct *mm = NULL;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));

@@ -4181,7 +4181,7 @@ static bool should_run_aging(struct lruv
unsigned long old = 0;
unsigned long young = 0;
unsigned long total = 0;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);

for (type = !can_swap; type < ANON_AND_FILE; type++) {
@@ -4466,7 +4466,7 @@ static bool sort_page(struct lruvec *lru
int delta = thp_nr_pages(page);
int refs = page_lru_refs(page);
int tier = lru_tier_from_refs(refs);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);

@@ -4566,7 +4566,7 @@ static int scan_pages(struct lruvec *lru
int scanned = 0;
int isolated = 0;
int remaining = MAX_LRU_BATCH;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);

VM_WARN_ON_ONCE(!list_empty(list));
@@ -4967,7 +4967,7 @@ done:

static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
{
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

if (lrugen->enabled) {
enum lru_list lru;
@@ -5247,7 +5247,7 @@ static void lru_gen_seq_show_full(struct
int i;
int type, tier;
int hist = lru_hist_from_seq(seq);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
@@ -5296,7 +5296,7 @@ static int lru_gen_seq_show(struct seq_f
unsigned long seq;
bool full = !debugfs_real_fops(m->file)->write;
struct lruvec *lruvec = v;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
int nid = lruvec_pgdat(lruvec)->node_id;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -5549,7 +5549,7 @@ void lru_gen_init_lruvec(struct lruvec *
{
int i;
int gen, type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct pag
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_page *lrugen;
int type = page_is_file_lru(page);
int delta = thp_nr_pages(page);
int refs = page_lru_refs(page);
@@ -252,7 +252,7 @@ static void lru_gen_refault(struct page
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_page *lrugen;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
int type = page_is_file_lru(page);
@ -0,0 +1,162 @@
From afd37e73db04c7e6b47411120ac5f6a7eca51fec Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:00 -0700
Subject: [PATCH 22/29] mm: multi-gen LRU: rename lrugen->lists[] to
 lrugen->pages[]

lru_gen_page will be chained into per-node lists by the coming
lrugen->list.

Link: https://lkml.kernel.org/r/20221222041905.2431096-3-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mm_inline.h | 4 ++--
include/linux/mmzone.h | 8 ++++----
mm/vmscan.c | 20 ++++++++++----------
3 files changed, 16 insertions(+), 16 deletions(-)

--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -246,9 +246,9 @@ static inline bool lru_gen_add_page(stru
lru_gen_update_size(lruvec, page, -1, gen);
/* for rotate_reclaimable_page() */
if (reclaiming)
- list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+ list_add_tail(&page->lru, &lrugen->pages[gen][type][zone]);
else
- list_add(&page->lru, &lrugen->lists[gen][type][zone]);
+ list_add(&page->lru, &lrugen->pages[gen][type][zone]);

return true;
}
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -302,7 +302,7 @@ enum lruvec_flags {
* They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
* offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
* corresponding generation. The gen counter in page->flags stores gen+1 while
- * a page is on one of lrugen->lists[]. Otherwise it stores 0.
+ * a page is on one of lrugen->pages[]. Otherwise it stores 0.
*
* A page is added to the youngest generation on faulting. The aging needs to
* check the accessed bit at least twice before handing this page over to the
@@ -314,8 +314,8 @@ enum lruvec_flags {
* rest of generations, if they exist, are considered inactive. See
* lru_gen_is_active().
*
- * PG_active is always cleared while a page is on one of lrugen->lists[] so that
- * the aging needs not to worry about it. And it's set again when a page
+ * PG_active is always cleared while a page is on one of lrugen->pages[] so
+ * that the aging needs not to worry about it. And it's set again when a page
* considered active is isolated for non-reclaiming purposes, e.g., migration.
* See lru_gen_add_page() and lru_gen_del_page().
*
@@ -402,7 +402,7 @@ struct lru_gen_page {
/* the birth time of each generation in jiffies */
unsigned long timestamps[MAX_NR_GENS];
/* the multi-gen LRU lists, lazily sorted on eviction */
- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ struct list_head pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the multi-gen LRU sizes, eventually consistent */
long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the exponential moving average of refaulted */
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3987,7 +3987,7 @@ static bool inc_min_seq(struct lruvec *l

/* prevent cold/hot inversion if force_scan is true */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- struct list_head *head = &lrugen->lists[old_gen][type][zone];
+ struct list_head *head = &lrugen->pages[old_gen][type][zone];

while (!list_empty(head)) {
struct page *page = lru_to_page(head);
@@ -3998,7 +3998,7 @@ static bool inc_min_seq(struct lruvec *l
VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);

new_gen = page_inc_gen(lruvec, page, false);
- list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
+ list_move_tail(&page->lru, &lrugen->pages[new_gen][type][zone]);

if (!--remaining)
return false;
@@ -4026,7 +4026,7 @@ static bool try_to_inc_min_seq(struct lr
gen = lru_gen_from_seq(min_seq[type]);

for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- if (!list_empty(&lrugen->lists[gen][type][zone]))
+ if (!list_empty(&lrugen->pages[gen][type][zone]))
goto next;
}

@@ -4491,7 +4491,7 @@ static bool sort_page(struct lruvec *lru

/* promoted */
if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
- list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+ list_move(&page->lru, &lrugen->pages[gen][type][zone]);
return true;
}

@@ -4500,7 +4500,7 @@ static bool sort_page(struct lruvec *lru
int hist = lru_hist_from_seq(lrugen->min_seq[type]);

gen = page_inc_gen(lruvec, page, false);
- list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+ list_move_tail(&page->lru, &lrugen->pages[gen][type][zone]);

WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
lrugen->protected[hist][type][tier - 1] + delta);
@@ -4512,7 +4512,7 @@ static bool sort_page(struct lruvec *lru
if (PageLocked(page) || PageWriteback(page) ||
(type == LRU_GEN_FILE && PageDirty(page))) {
gen = page_inc_gen(lruvec, page, true);
- list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+ list_move(&page->lru, &lrugen->pages[gen][type][zone]);
return true;
}

@@ -4579,7 +4579,7 @@ static int scan_pages(struct lruvec *lru
for (zone = sc->reclaim_idx; zone >= 0; zone--) {
LIST_HEAD(moved);
int skipped = 0;
- struct list_head *head = &lrugen->lists[gen][type][zone];
+ struct list_head *head = &lrugen->pages[gen][type][zone];

while (!list_empty(head)) {
struct page *page = lru_to_page(head);
@@ -4980,7 +4980,7 @@ static bool __maybe_unused state_is_vali
int gen, type, zone;

for_each_gen_type_zone(gen, type, zone) {
- if (!list_empty(&lrugen->lists[gen][type][zone]))
+ if (!list_empty(&lrugen->pages[gen][type][zone]))
return false;
}
}
@@ -5025,7 +5025,7 @@ static bool drain_evictable(struct lruve
int remaining = MAX_LRU_BATCH;

for_each_gen_type_zone(gen, type, zone) {
- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+ struct list_head *head = &lruvec->lrugen.pages[gen][type][zone];

while (!list_empty(head)) {
bool success;
@@ -5558,7 +5558,7 @@ void lru_gen_init_lruvec(struct lruvec *
lrugen->timestamps[i] = jiffies;

for_each_gen_type_zone(gen, type, zone)
- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+ INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]);

lruvec->mm_state.seq = MIN_NR_GENS;
init_waitqueue_head(&lruvec->mm_state.wait);
@ -0,0 +1,188 @@
From ce45f1c4b32cf69b166f56ef5bc6c761e06ed4e5 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:01 -0700
Subject: [PATCH 23/29] mm: multi-gen LRU: remove eviction fairness safeguard

Recall that the eviction consumes the oldest generation: first it
bucket-sorts pages whose gen counters were updated by the aging and
reclaims the rest; then it increments lrugen->min_seq.

The current eviction fairness safeguard for global reclaim has a
dilemma: when there are multiple eligible memcgs, should it continue
or stop upon meeting the reclaim goal? If it continues, it overshoots
and increases direct reclaim latency; if it stops, it loses fairness
between memcgs it has taken memory away from and those it has yet to.

With memcg LRU, the eviction, while ensuring eventual fairness, will
stop upon meeting its goal. Therefore the current eviction fairness
safeguard for global reclaim will not be needed.

Note that memcg LRU only applies to global reclaim. For memcg reclaim,
the eviction will continue, even if it is overshooting. This becomes
unconditional due to code simplification.
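
With the safeguard gone, the stop condition collapses into a single
reclaim-goal helper (assembled verbatim from the hunks below; returning
-1 acts as an effectively unlimited goal for memcg reclaim):

    static unsigned long get_nr_to_reclaim(struct scan_control *sc)
    {
        /* don't abort memcg reclaim to ensure fairness */
        if (!global_reclaim(sc))
            return -1;

        /* discount the previous progress for kswapd */
        if (current_is_kswapd())
            return sc->nr_to_reclaim + sc->last_reclaimed;

        return max(sc->nr_to_reclaim, compact_gap(sc->order));
    }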

Link: https://lkml.kernel.org/r/20221222041905.2431096-4-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 82 +++++++++++++++--------------------------------------
1 file changed, 23 insertions(+), 59 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,11 @@ static bool cgroup_reclaim(struct scan_c
return sc->target_mem_cgroup;
}

+static bool global_reclaim(struct scan_control *sc)
+{
+ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
+}
+
/**
* writeback_throttling_sane - is the usual dirty throttling mechanism available?
* @sc: scan_control in question
@@ -493,6 +498,11 @@ static bool cgroup_reclaim(struct scan_c
return false;
}

+static bool global_reclaim(struct scan_control *sc)
+{
+ return true;
+}
+
static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
@@ -4722,8 +4732,7 @@ static int isolate_pages(struct lruvec *
return scanned;
}

-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
- bool *need_swapping)
+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -4812,9 +4821,6 @@ retry:
goto retry;
}

- if (need_swapping && type == LRU_GEN_ANON)
- *need_swapping = true;
-
return scanned;
}

@@ -4853,68 +4859,26 @@ done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
}

-static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
- struct scan_control *sc, bool need_swapping)
+static unsigned long get_nr_to_reclaim(struct scan_control *sc)
{
- int i;
- DEFINE_MAX_SEQ(lruvec);
-
- if (!current_is_kswapd()) {
- /* age each memcg once to ensure fairness */
- if (max_seq - seq > 1)
- return true;
-
- /* over-swapping can increase allocation latency */
- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
- return true;
-
- /* give this thread a chance to exit and free its memory */
- if (fatal_signal_pending(current)) {
- sc->nr_reclaimed += MIN_LRU_BATCH;
- return true;
- }
-
- if (cgroup_reclaim(sc))
- return false;
- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
- return false;
-
- /* keep scanning at low priorities to ensure fairness */
- if (sc->priority > DEF_PRIORITY - 2)
- return false;
-
- /*
- * A minimum amount of work was done under global memory pressure. For
- * kswapd, it may be overshooting. For direct reclaim, the target isn't
- * met, and yet the allocation may still succeed, since kswapd may have
- * caught up. In either case, it's better to stop now, and restart if
- * necessary.
- */
- for (i = 0; i <= sc->reclaim_idx; i++) {
- unsigned long wmark;
- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
- wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
- if (wmark > zone_page_state(zone, NR_FREE_PAGES))
- return false;
- }
+ /* don't abort memcg reclaim to ensure fairness */
+ if (!global_reclaim(sc))
+ return -1;

- sc->nr_reclaimed += MIN_LRU_BATCH;
+ /* discount the previous progress for kswapd */
+ if (current_is_kswapd())
+ return sc->nr_to_reclaim + sc->last_reclaimed;

- return true;
+ return max(sc->nr_to_reclaim, compact_gap(sc->order));
}

static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
bool need_aging = false;
- bool need_swapping = false;
unsigned long scanned = 0;
unsigned long reclaimed = sc->nr_reclaimed;
- DEFINE_MAX_SEQ(lruvec);
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

lru_add_drain();

@@ -4938,7 +4902,7 @@ static void lru_gen_shrink_lruvec(struct
if (!nr_to_scan)
goto done;

- delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
+ delta = evict_pages(lruvec, sc, swappiness);
if (!delta)
goto done;

@@ -4946,7 +4910,7 @@ static void lru_gen_shrink_lruvec(struct
if (scanned >= nr_to_scan)
break;

- if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+ if (sc->nr_reclaimed >= nr_to_reclaim)
break;

cond_resched();
@@ -5393,7 +5357,7 @@ static int run_eviction(struct lruvec *l
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;

- if (!evict_pages(lruvec, sc, swappiness, NULL))
+ if (!evict_pages(lruvec, sc, swappiness))
return 0;

cond_resched();
@ -0,0 +1,287 @@
From e20b7386fccc18c791796eb1dc1a91eee3ccf801 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:02 -0700
Subject: [PATCH 24/29] mm: multi-gen LRU: remove aging fairness safeguard

Recall that the aging produces the youngest generation: first it scans
for accessed pages and updates their gen counters; then it increments
lrugen->max_seq.

The current aging fairness safeguard for kswapd uses two passes to
ensure the fairness to multiple eligible memcgs. On the first pass,
which is shared with the eviction, it checks whether all eligible
memcgs are low on cold pages. If so, it requires a second pass, on
which it ages all those memcgs at the same time.

With memcg LRU, the aging, while ensuring eventual fairness, will run
when necessary. Therefore the current aging fairness safeguard for
kswapd will not be needed.

Note that memcg LRU only applies to global reclaim. For memcg reclaim,
the aging can be unfair to different memcgs, i.e., their
lrugen->max_seq can be incremented at different paces.
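
After this change, the kswapd min_ttl path reduces to one predicate per
lruvec; reassembled here from the hunks below for readability:

    static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
                                      unsigned long min_ttl)
    {
        int gen;
        unsigned long birth;
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        DEFINE_MIN_SEQ(lruvec);

        /* see the comment on lru_gen_page */
        gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
        birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);

        if (time_is_after_jiffies(birth + min_ttl))
            return false;

        if (!lruvec_is_sizable(lruvec, sc))
            return false;

        mem_cgroup_calculate_protection(NULL, memcg);

        return !mem_cgroup_below_min(memcg);
    }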

Link: https://lkml.kernel.org/r/20221222041905.2431096-5-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 126 ++++++++++++++++++++++++----------------------------
1 file changed, 59 insertions(+), 67 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -131,7 +131,6 @@ struct scan_control {

#ifdef CONFIG_LRU_GEN
/* help kswapd make better choices among multiple memcgs */
- unsigned int memcgs_need_aging:1;
unsigned long last_reclaimed;
#endif

@@ -4184,7 +4183,7 @@ done:
return true;
}

-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
{
int gen, type, zone;
@@ -4193,6 +4192,13 @@ static bool should_run_aging(struct lruv
unsigned long total = 0;
struct lru_gen_page *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ /* whether this lruvec is completely out of cold pages */
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
+ *nr_to_scan = 0;
+ return true;
+ }

for (type = !can_swap; type < ANON_AND_FILE; type++) {
unsigned long seq;
@@ -4221,8 +4227,6 @@ static bool should_run_aging(struct lruv
* stalls when the number of generations reaches MIN_NR_GENS. Hence, the
* ideal number of generations is MIN_NR_GENS+1.
*/
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
- return true;
if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
return false;

@@ -4241,40 +4245,54 @@ static bool should_run_aging(struct lruv
return false;
}

-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
- bool need_aging;
- unsigned long nr_to_scan;
- int swappiness = get_swappiness(lruvec, sc);
+ int gen, type, zone;
+ unsigned long total = 0;
+ bool can_swap = get_swappiness(lruvec, sc);
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);

- VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;

- mem_cgroup_calculate_protection(NULL, memcg);
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ gen = lru_gen_from_seq(seq);

- if (mem_cgroup_below_min(memcg))
- return false;
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+ }
+ }

- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+ /* whether the size is big enough to be helpful */
+ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+}

- if (min_ttl) {
- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
+ unsigned long min_ttl)
+{
+ int gen;
+ unsigned long birth;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MIN_SEQ(lruvec);

- if (time_is_after_jiffies(birth + min_ttl))
- return false;
+ VM_WARN_ON_ONCE(sc->memcg_low_reclaim);

- /* the size is likely too small to be helpful */
- if (!nr_to_scan && sc->priority != DEF_PRIORITY)
- return false;
- }
+ /* see the comment on lru_gen_page */
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);

- if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
+ if (time_is_after_jiffies(birth + min_ttl))
+ return false;

- return true;
+ if (!lruvec_is_sizable(lruvec, sc))
+ return false;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ return !mem_cgroup_below_min(memcg);
}

/* to protect the working set of the last N jiffies */
@@ -4283,46 +4301,32 @@ static unsigned long lru_gen_min_ttl __r
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
- bool success = false;
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);

VM_WARN_ON_ONCE(!current_is_kswapd());

sc->last_reclaimed = sc->nr_reclaimed;

- /*
- * To reduce the chance of going into the aging path, which can be
- * costly, optimistically skip it if the flag below was cleared in the
- * eviction path. This improves the overall performance when multiple
- * memcgs are available.
- */
- if (!sc->memcgs_need_aging) {
- sc->memcgs_need_aging = true;
+ /* check the order to exclude compaction-induced reclaim */
+ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
- }
-
- set_mm_walk(pgdat);

memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

- if (age_lruvec(lruvec, sc, min_ttl))
- success = true;
+ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
+ mem_cgroup_iter_break(NULL, memcg);
+ return;
+ }

cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));

- clear_mm_walk();
-
- /* check the order to exclude compaction-induced reclaim */
- if (success || !min_ttl || sc->order)
- return;
-
/*
* The main goal is to OOM kill if every generation from all memcgs is
* younger than min_ttl. However, another possibility is all memcgs are
- * either below min or empty.
+ * either too small or below min.
*/
if (mutex_trylock(&oom_lock)) {
struct oom_control oc = {
@@ -4830,33 +4834,27 @@ retry:
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap, bool *need_aging)
+ bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);

if (mem_cgroup_below_min(memcg) ||
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;

- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!*need_aging)
+ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
return nr_to_scan;

/* skip the aging path at the default priority */
if (sc->priority == DEF_PRIORITY)
- goto done;
+ return nr_to_scan;

- /* leave the work to lru_gen_age_node() */
- if (current_is_kswapd())
- return 0;
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);

- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
- return nr_to_scan;
-done:
- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+ /* skip this lruvec as it's low on cold pages */
+ return 0;
}

static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -4875,9 +4873,7 @@ static unsigned long get_nr_to_reclaim(s
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
- bool need_aging = false;
unsigned long scanned = 0;
- unsigned long reclaimed = sc->nr_reclaimed;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

lru_add_drain();
@@ -4898,13 +4894,13 @@ static void lru_gen_shrink_lruvec(struct
else
swappiness = 0;

- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
if (!nr_to_scan)
- goto done;
+ break;

delta = evict_pages(lruvec, sc, swappiness);
if (!delta)
- goto done;
+ break;

scanned += delta;
if (scanned >= nr_to_scan)
@@ -4916,10 +4912,6 @@ static void lru_gen_shrink_lruvec(struct
cond_resched();
}

- /* see the comment in lru_gen_age_node() */
- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
- sc->memcgs_need_aging = false;
-done:
clear_mm_walk();

blk_finish_plug(&plug);
@ -0,0 +1,161 @@
From 107d54931df3c28d81648122e219bf0034ef4e99 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:03 -0700
Subject: [PATCH 25/29] mm: multi-gen LRU: shuffle should_run_aging()

Move should_run_aging() next to its only remaining caller.

Link: https://lkml.kernel.org/r/20221222041905.2431096-6-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 124 ++++++++++++++++++++++++++--------------------------
1 file changed, 62 insertions(+), 62 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4183,68 +4183,6 @@ done:
return true;
}

-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
-{
- int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
- unsigned long total = 0;
- struct lru_gen_page *lrugen = &lruvec->lrugen;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- DEFINE_MIN_SEQ(lruvec);
-
- /* whether this lruvec is completely out of cold pages */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
- *nr_to_scan = 0;
- return true;
- }
-
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
- unsigned long seq;
-
- for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
- gen = lru_gen_from_seq(seq);
-
- for (zone = 0; zone < MAX_NR_ZONES; zone++)
- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
- }
- }
-
- /* try to scrape all its memory if this memcg was deleted */
- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
-}
-
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
@@ -4828,6 +4766,68 @@ retry:
return scanned;
}

+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+{
+ int gen, type, zone;
+ unsigned long old = 0;
+ unsigned long young = 0;
+ unsigned long total = 0;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ /* whether this lruvec is completely out of cold pages */
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
+ *nr_to_scan = 0;
+ return true;
+ }
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ unsigned long size = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ total += size;
+ if (seq == max_seq)
+ young += size;
+ else if (seq + MIN_NR_GENS == max_seq)
+ old += size;
+ }
+ }
+
+ /* try to scrape all its memory if this memcg was deleted */
+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+
+ /*
+ * The aging tries to be lazy to reduce the overhead, while the eviction
+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+ * ideal number of generations is MIN_NR_GENS+1.
+ */
+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+ return false;
+
+ /*
+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+ * of the total number of pages for each generation. A reasonable range
+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+ * aging cares about the upper bound of hot pages, while the eviction
+ * cares about the lower bound of cold pages.
+ */
+ if (young * MIN_NR_GENS > total)
+ return true;
+ if (old * (MIN_NR_GENS + 2) < total)
+ return true;
+
+ return false;
+}
+
/*
* For future optimizations:
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
@ -0,0 +1,868 @@
From fa6363828d314e837c5f79e97ea5e8c0d2f7f062 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:04 -0700
Subject: [PATCH 26/29] mm: multi-gen LRU: per-node lru_gen_page lists

For each node, memcgs are divided into two generations: the old and
the young. For each generation, memcgs are randomly sharded into
multiple bins to improve scalability. For each bin, an RCU hlist_nulls
is virtually divided into three segments: the head, the tail and the
default.

An onlining memcg is added to the tail of a random bin in the old
generation. The eviction starts at the head of a random bin in the old
generation. The per-node memcg generation counter, whose remainder (mod
2) indexes the old generation, is incremented when all its bins become
empty.

There are four operations:
1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in
its current generation (old or young) and updates its "seg" to
"head";
2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in
its current generation (old or young) and updates its "seg" to
"tail";
3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in
the old generation, updates its "gen" to "old" and resets its "seg"
to "default";
4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin
in the young generation, updates its "gen" to "young" and resets
its "seg" to "default".

The events that trigger the above operations are:
1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
2. The first attempt to reclaim an memcg below low, which triggers
MEMCG_LRU_TAIL;
3. The first attempt to reclaim an memcg below reclaimable size
threshold, which triggers MEMCG_LRU_TAIL;
4. The second attempt to reclaim an memcg below reclaimable size
threshold, which triggers MEMCG_LRU_YOUNG;
5. Attempting to reclaim an memcg below min, which triggers
MEMCG_LRU_YOUNG;
6. Finishing the aging on the eviction path, which triggers
MEMCG_LRU_YOUNG;
7. Offlining an memcg, which triggers MEMCG_LRU_OLD.

Note that memcg LRU only applies to global reclaim, and the
round-robin incrementing of their max_seq counters ensures the
eventual fairness to all eligible memcgs. For memcg reclaim, it still
relies on mem_cgroup_iter().

Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/memcontrol.h | 10 +
include/linux/mm_inline.h | 17 ++
include/linux/mmzone.h | 117 +++++++++++-
mm/memcontrol.c | 16 ++
mm/page_alloc.c | 1 +
mm/vmscan.c | 373 +++++++++++++++++++++++++++++++++----
6 files changed, 499 insertions(+), 35 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -818,6 +818,11 @@ static inline void obj_cgroup_put(struct
percpu_ref_put(&objcg->refcnt);
}

+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return !memcg || css_tryget(&memcg->css);
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -1283,6 +1288,11 @@ struct mem_cgroup *mem_cgroup_from_css(s
return NULL;
}

+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -112,6 +112,18 @@ static inline bool lru_gen_in_fault(void
return current->in_lru_fault;
}

+#ifdef CONFIG_MEMCG
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return READ_ONCE(lruvec->lrugen.seg);
+}
+#else
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+#endif
+
static inline int lru_gen_from_seq(unsigned long seq)
{
return seq % MAX_NR_GENS;
@@ -287,6 +299,11 @@ static inline bool lru_gen_in_fault(void
return false;
}

+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+
static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
return false;
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -7,6 +7,7 @@

#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
@@ -357,6 +358,15 @@ struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

+/* see the comment on MEMCG_NR_GENS */
+enum {
+ MEMCG_LRU_NOP,
+ MEMCG_LRU_HEAD,
+ MEMCG_LRU_TAIL,
+ MEMCG_LRU_OLD,
+ MEMCG_LRU_YOUNG,
+};
+
#ifdef CONFIG_LRU_GEN

enum {
@@ -416,6 +426,14 @@ struct lru_gen_page {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
+#ifdef CONFIG_MEMCG
+ /* the memcg generation this lru_gen_page belongs to */
+ u8 gen;
+ /* the list segment this lru_gen_page belongs to */
+ u8 seg;
+ /* per-node lru_gen_page list for global reclaim */
+ struct hlist_nulls_node list;
+#endif
};

enum {
@@ -469,12 +487,87 @@ void lru_gen_init_lruvec(struct lruvec *
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

#ifdef CONFIG_MEMCG
+
+/*
+ * For each node, memcgs are divided into two generations: the old and the
+ * young. For each generation, memcgs are randomly sharded into multiple bins
+ * to improve scalability. For each bin, the hlist_nulls is virtually divided
+ * into three segments: the head, the tail and the default.
+ *
+ * An onlining memcg is added to the tail of a random bin in the old generation.
+ * The eviction starts at the head of a random bin in the old generation. The
+ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes
+ * the old generation, is incremented when all its bins become empty.
+ *
+ * There are four operations:
+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
+ * current generation (old or young) and updates its "seg" to "head";
+ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
+ * current generation (old or young) and updates its "seg" to "tail";
+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
+ * generation, updates its "gen" to "old" and resets its "seg" to "default";
+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
+ * young generation, updates its "gen" to "young" and resets its "seg" to
+ * "default".
+ *
+ * The events that trigger the above operations are:
+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
+ * 2. The first attempt to reclaim an memcg below low, which triggers
+ * MEMCG_LRU_TAIL;
+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_TAIL;
+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_YOUNG;
+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
+ *
+ * Note that memcg LRU only applies to global reclaim, and the round-robin
+ * incrementing of their max_seq counters ensures the eventual fairness to all
+ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ */
+#define MEMCG_NR_GENS 2
+#define MEMCG_NR_BINS 8
+
+struct lru_gen_memcg {
+ /* the per-node memcg generation counter */
+ unsigned long seq;
+ /* each memcg has one lru_gen_page per node */
+ unsigned long nr_memcgs[MEMCG_NR_GENS];
+ /* per-node lru_gen_page list for global reclaim */
+ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
+ /* protects the above */
+ spinlock_t lock;
+};
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat);
+
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
-#endif
+void lru_gen_online_memcg(struct mem_cgroup *memcg);
+void lru_gen_offline_memcg(struct mem_cgroup *memcg);
+void lru_gen_release_memcg(struct mem_cgroup *memcg);
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
+
+#else /* !CONFIG_MEMCG */
+
+#define MEMCG_NR_GENS 1
+
+struct lru_gen_memcg {
+};
+
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
+#endif /* CONFIG_MEMCG */

#else /* !CONFIG_LRU_GEN */

+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
@@ -484,6 +577,7 @@ static inline void lru_gen_look_around(s
}

#ifdef CONFIG_MEMCG
+
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
@@ -491,7 +585,24 @@ static inline void lru_gen_init_memcg(st
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}
-#endif
+
+static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+}
+
+#endif /* CONFIG_MEMCG */

#endif /* CONFIG_LRU_GEN */

@@ -1105,6 +1216,8 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
+ /* lru_gen_page list */
+ struct lru_gen_memcg memcg_lru;
#endif

ZONE_PADDING(_pad2_)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -549,6 +549,16 @@ static void mem_cgroup_update_tree(struc
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;

+ if (lru_gen_enabled()) {
+ struct lruvec *lruvec = &mem_cgroup_page_nodeinfo(memcg, page)->lruvec;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+
+ return;
+ }
+
mctz = soft_limit_tree_from_page(page);
if (!mctz)
return;
@@ -3433,6 +3443,9 @@ unsigned long mem_cgroup_soft_limit_recl
unsigned long excess;
unsigned long nr_scanned;

+ if (lru_gen_enabled())
+ return 0;
+
if (order > 0)
return 0;

@@ -5321,6 +5334,7 @@ static int mem_cgroup_css_online(struct
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
+ lru_gen_online_memcg(memcg);
return 0;
}

@@ -5347,6 +5361,7 @@ static void mem_cgroup_css_offline(struc
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
+ lru_gen_offline_memcg(memcg);

drain_all_stock(memcg);

@@ -5358,6 +5373,7 @@ static void mem_cgroup_css_released(stru
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

invalidate_reclaim_iterators(memcg);
+ lru_gen_release_memcg(memcg);
}

static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7661,6 +7661,7 @@ static void __init free_area_init_node(i
pgdat_set_deferred_range(pgdat);

free_area_init_core(pgdat);
+ lru_gen_init_pgdat(pgdat);
}

void __init free_area_init_memoryless_node(int nid)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,8 @@
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
+#include <linux/rculist_nulls.h>
+#include <linux/random.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -129,11 +131,6 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;

-#ifdef CONFIG_LRU_GEN
- /* help kswapd make better choices among multiple memcgs */
- unsigned long last_reclaimed;
-#endif
-
/* Allocation order */
s8 order;

@@ -2880,6 +2877,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_ca
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)

+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
+
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -4169,8 +4169,7 @@ done:
if (sc->priority <= DEF_PRIORITY - 2)
wait_event_killable(lruvec->mm_state.wait,
max_seq < READ_ONCE(lrugen->max_seq));
-
- return max_seq < READ_ONCE(lrugen->max_seq);
+ return false;
}

VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
@@ -4243,8 +4242,6 @@ static void lru_gen_age_node(struct pgli

VM_WARN_ON_ONCE(!current_is_kswapd());

- sc->last_reclaimed = sc->nr_reclaimed;
-
/* check the order to exclude compaction-induced reclaim */
if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
@@ -4833,8 +4830,7 @@ static bool should_run_aging(struct lruv
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -4851,10 +4847,8 @@ static unsigned long get_nr_to_scan(stru
if (sc->priority == DEF_PRIORITY)
return nr_to_scan;

- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
-
/* skip this lruvec as it's low on cold pages */
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}

static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -4863,29 +4857,18 @@ static unsigned long get_nr_to_reclaim(s
if (!global_reclaim(sc))
return -1;

- /* discount the previous progress for kswapd */
- if (current_is_kswapd())
- return sc->nr_to_reclaim + sc->last_reclaimed;
-
return max(sc->nr_to_reclaim, compact_gap(sc->order));
}

-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- struct blk_plug plug;
+ long nr_to_scan;
unsigned long scanned = 0;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

- lru_add_drain();
-
- blk_start_plug(&plug);
-
- set_mm_walk(lruvec_pgdat(lruvec));
-
while (true) {
int delta;
int swappiness;
- unsigned long nr_to_scan;

if (sc->may_swap)
swappiness = get_swappiness(lruvec, sc);
@@ -4895,7 +4878,7 @@ static void lru_gen_shrink_lruvec(struct
swappiness = 0;

nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
- if (!nr_to_scan)
+ if (nr_to_scan <= 0)
break;

delta = evict_pages(lruvec, sc, swappiness);
@@ -4912,10 +4895,250 @@ static void lru_gen_shrink_lruvec(struct
cond_resched();
}

+ /* whether try_to_inc_max_seq() was successful */
+ return nr_to_scan < 0;
+}
+
+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+{
+ bool success;
+ unsigned long scanned = sc->nr_scanned;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ int seg = lru_gen_memcg_seg(lruvec);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (!lruvec_is_sizable(lruvec, sc))
+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(memcg))
+ return MEMCG_LRU_YOUNG;
+
+ if (mem_cgroup_below_low(memcg)) {
+ /* see the comment on MEMCG_NR_GENS */
+ if (seg != MEMCG_LRU_TAIL)
+ return MEMCG_LRU_TAIL;
+
+ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ success = try_to_shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
+
+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
+ current->reclaim_state->reclaimed_slab = 0;
+
+ return success ? MEMCG_LRU_YOUNG : 0;
+}
+
+#ifdef CONFIG_MEMCG
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int gen;
+ int bin;
+ int first_bin;
+ struct lruvec *lruvec;
+ struct lru_gen_page *lrugen;
+ const struct hlist_nulls_node *pos;
+ int op = 0;
+ struct mem_cgroup *memcg = NULL;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+
+ bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
+restart:
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
+
+ rcu_read_lock();
+
+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ lruvec = container_of(lrugen, struct lruvec, lrugen);
+ memcg = lruvec_memcg(lruvec);
+
+ if (!mem_cgroup_tryget(memcg)) {
+ op = 0;
+ memcg = NULL;
+ continue;
+ }
+
+ rcu_read_unlock();
+
+ op = shrink_one(lruvec, sc);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ goto success;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+
+ /* restart if raced with lru_gen_rotate_memcg() */
+ if (gen != get_nulls_value(pos))
+ goto restart;
+
+ /* try the rest of the bins of the current generation */
+ bin = get_memcg_bin(bin + 1);
+ if (bin != first_bin)
+ goto restart;
+success:
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct blk_plug plug;
+
+ VM_WARN_ON_ONCE(global_reclaim(sc));
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(lruvec_pgdat(lruvec));
+
+ if (try_to_shrink_lruvec(lruvec, sc))
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
+
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+#endif
+
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
+ * estimated reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_swappiness(lruvec, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ reclaimable /= MEMCG_NR_GENS;
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ sc->priority = clamp(priority, 0, DEF_PRIORITY);
+}
+
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ unsigned long reclaimed = sc->nr_reclaimed;
+
+ VM_WARN_ON_ONCE(!global_reclaim(sc));
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(pgdat);
+
+ set_initial_priority(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed = 0;
+
+ if (mem_cgroup_disabled())
+ shrink_one(&pgdat->__lruvec, sc);
+ else
+ shrink_many(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed += reclaimed;
+
clear_mm_walk();

blk_finish_plug(&plug);
+
+ /* kswapd should never fail */
+ pgdat->kswapd_failures = 0;
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+ int seg;
+ int old, new;
+ int bin = prandom_u32_max(MEMCG_NR_BINS);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ seg = 0;
+ new = old = lruvec->lrugen.gen;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (op == MEMCG_LRU_HEAD)
+ seg = MEMCG_LRU_HEAD;
+ else if (op == MEMCG_LRU_TAIL)
+ seg = MEMCG_LRU_TAIL;
+ else if (op == MEMCG_LRU_OLD)
+ new = get_memcg_gen(pgdat->memcg_lru.seq);
+ else if (op == MEMCG_LRU_YOUNG)
+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+ else
+ VM_WARN_ON_ONCE(true);
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ else
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+ pgdat->memcg_lru.nr_memcgs[old]--;
+ pgdat->memcg_lru.nr_memcgs[new]++;
+
+ lruvec->lrugen.gen = new;
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
}
+#endif

/******************************************************************************
* state change
@@ -5370,11 +5593,11 @@ static int run_cmd(char cmd, int memcg_i

if (!mem_cgroup_disabled()) {
rcu_read_lock();
+
memcg = mem_cgroup_from_id(memcg_id);
-#ifdef CONFIG_MEMCG
- if (memcg && !css_tryget(&memcg->css))
+ if (!mem_cgroup_tryget(memcg))
memcg = NULL;
-#endif
+
rcu_read_unlock();

if (!memcg)
@@ -5521,6 +5744,19 @@ void lru_gen_init_lruvec(struct lruvec *
}

#ifdef CONFIG_MEMCG
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
+
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->mm_list.fifo);
@@ -5544,7 +5780,69 @@ void lru_gen_exit_memcg(struct mem_cgrou
}
}
}
-#endif
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+ int bin = prandom_u32_max(MEMCG_NR_BINS);
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+ pgdat->memcg_lru.nr_memcgs[gen]++;
+
+ lruvec->lrugen.gen = gen;
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+ }
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = lruvec->lrugen.gen;
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+ pgdat->memcg_lru.nr_memcgs[gen]--;
+
+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+#endif /* CONFIG_MEMCG */

static int __init init_lru_gen(void)
{
@@ -5571,6 +5869,10 @@ static void lru_gen_shrink_lruvec(struct
{
}

+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
#endif /* CONFIG_LRU_GEN */

static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5584,7 +5886,7 @@ static void shrink_lruvec(struct lruvec
bool proportional_reclaim;
struct blk_plug plug;

- if (lru_gen_enabled()) {
+ if (lru_gen_enabled() && !global_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -5826,6 +6128,11 @@ static void shrink_node(pg_data_t *pgdat
struct lruvec *target_lruvec;
bool reclaimable = false;

+ if (lru_gen_enabled() && global_reclaim(sc)) {
+ lru_gen_shrink_node(pgdat, sc);
+ return;
+ }
+
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
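Editor's note: the sharding arithmetic described in the commit message above is compact enough to illustrate in isolation. The stand-alone C sketch below mirrors the get_memcg_gen()/get_memcg_bin() macros from the diff; everything else (the constants' reuse in userspace, main(), rand()) is hypothetical scaffolding for illustration only, not kernel code.

#include <stdio.h>
#include <stdlib.h>

#define MEMCG_NR_GENS 2
#define MEMCG_NR_BINS 8

/* mirrors the kernel macros added in mm/vmscan.c above */
#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)

int main(void)
{
	unsigned long seq = 0; /* per-node memcg generation counter */

	/* the remainder (mod MEMCG_NR_GENS) indexes the old generation */
	unsigned long old_gen = get_memcg_gen(seq);
	/* MEMCG_LRU_YOUNG moves a memcg into the other generation */
	unsigned long young_gen = get_memcg_gen(seq + 1);
	/* an onlining memcg lands in a random bin of the old generation */
	unsigned long bin = get_memcg_bin((unsigned long)rand());

	printf("old=%lu young=%lu bin=%lu\n", old_gen, young_gen, bin);
	return 0;
}

Incrementing seq swaps the roles of the two generations, which is why emptying every bin of the old generation is the trigger for bumping the counter.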
@ -0,0 +1,196 @@
From 93147736b5b3a21bea24313bfc7a696829932009 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:05 -0700
Subject: [PATCH 27/29] mm: multi-gen LRU: clarify scan_control flags

Among the flags in scan_control:
1. sc->may_swap, which indicates swap constraint due to memsw.max, is
supported as usual.
2. sc->proactive, which indicates reclaim by memory.reclaim, may not
opportunistically skip the aging path, since it is considered less
latency sensitive.
3. !(sc->gfp_mask & __GFP_IO), which indicates IO constraint, lowers
swappiness to prioritize file LRU, since clean file pages are more
likely to exist.
4. sc->may_writepage and sc->may_unmap, which indicate opportunistic
reclaim, are rejected, since unmapped clean pages are already
prioritized. Scanning for more of them is likely futile and can
cause high reclaim latency when there is a large number of memcgs.

The rest are handled by the existing code.

Link: https://lkml.kernel.org/r/20221222041905.2431096-8-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 55 +++++++++++++++++++++++++++--------------------------
1 file changed, 28 insertions(+), 27 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2905,6 +2905,9 @@ static int get_swappiness(struct lruvec
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);

+ if (!sc->may_swap)
+ return 0;
+
if (!can_demote(pgdat->node_id, sc) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -3952,7 +3955,7 @@ static void walk_mm(struct lruvec *lruve
} while (err == -EAGAIN);
}

-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
{
struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;

@@ -3960,7 +3963,7 @@ static struct lru_gen_mm_walk *set_mm_wa
VM_WARN_ON_ONCE(walk);

walk = &pgdat->mm_walk;
- } else if (!pgdat && !walk) {
+ } else if (!walk && force_alloc) {
VM_WARN_ON_ONCE(current_is_kswapd());

walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -4146,7 +4149,7 @@ static bool try_to_inc_max_seq(struct lr
goto done;
}

- walk = set_mm_walk(NULL);
+ walk = set_mm_walk(NULL, true);
if (!walk) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
@@ -4215,8 +4218,6 @@ static bool lruvec_is_reclaimable(struct
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);

- VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
-
/* see the comment on lru_gen_page */
gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
@@ -4472,12 +4473,8 @@ static bool isolate_page(struct lruvec *
{
bool success;

- /* unmapping inhibited */
- if (!sc->may_unmap && page_mapped(page))
- return false;
-
/* swapping inhibited */
- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+ if (!(sc->gfp_mask & __GFP_IO) &&
(PageDirty(page) ||
(PageAnon(page) && !PageSwapCache(page))))
return false;
@@ -4574,9 +4571,8 @@ static int scan_pages(struct lruvec *lru
__count_vm_events(PGSCAN_ANON + type, isolated);

/*
- * There might not be eligible pages due to reclaim_idx, may_unmap and
- * may_writepage. Check the remaining to prevent livelock if it's not
- * making progress.
+ * There might not be eligible pages due to reclaim_idx. Check the
+ * remaining to prevent livelock if it's not making progress.
*/
return isolated || !remaining ? scanned : 0;
}
@@ -4836,8 +4832,7 @@ static long get_nr_to_scan(struct lruvec
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);

- if (mem_cgroup_below_min(memcg) ||
- (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ if (mem_cgroup_below_min(memcg))
return 0;

if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
@@ -4865,17 +4860,14 @@ static bool try_to_shrink_lruvec(struct
long nr_to_scan;
unsigned long scanned = 0;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ int swappiness = get_swappiness(lruvec, sc);
+
+ /* clean file pages are more likely to exist */
+ if (swappiness && !(sc->gfp_mask & __GFP_IO))
+ swappiness = 1;

while (true) {
int delta;
- int swappiness;
-
- if (sc->may_swap)
- swappiness = get_swappiness(lruvec, sc);
- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
- swappiness = 1;
- else
- swappiness = 0;

nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
if (nr_to_scan <= 0)
@@ -5005,12 +4997,13 @@ static void lru_gen_shrink_lruvec(struct
struct blk_plug plug;

VM_WARN_ON_ONCE(global_reclaim(sc));
+ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);

lru_add_drain();

blk_start_plug(&plug);

- set_mm_walk(lruvec_pgdat(lruvec));
+ set_mm_walk(NULL, false);

if (try_to_shrink_lruvec(lruvec, sc))
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
@@ -5066,11 +5059,19 @@ static void lru_gen_shrink_node(struct p

VM_WARN_ON_ONCE(!global_reclaim(sc));

+ /*
+ * Unmapped clean pages are already prioritized. Scanning for more of
+ * them is likely futile and can cause high reclaim latency when there
+ * is a large number of memcgs.
+ */
+ if (!sc->may_writepage || !sc->may_unmap)
+ goto done;
+
lru_add_drain();

blk_start_plug(&plug);

- set_mm_walk(pgdat);
+ set_mm_walk(pgdat, false);

set_initial_priority(pgdat, sc);

@@ -5088,7 +5089,7 @@ static void lru_gen_shrink_node(struct p
clear_mm_walk();

blk_finish_plug(&plug);
-
+done:
/* kswapd should never fail */
pgdat->kswapd_failures = 0;
}
@@ -5656,7 +5657,7 @@ static ssize_t lru_gen_seq_write(struct
set_task_reclaim_state(current, &sc.reclaim_state);
flags = memalloc_noreclaim_save();
blk_start_plug(&plug);
- if (!set_mm_walk(NULL)) {
+ if (!set_mm_walk(NULL, true)) {
err = -ENOMEM;
goto done;
}
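Editor's note: the swappiness policy this patch hoists out of the eviction loop can be restated as a pure function. The sketch below is an illustration of that logic under stated assumptions; struct scan_ctl and its fields are stand-ins, not the kernel's scan_control.

#include <stdbool.h>

struct scan_ctl {
	bool may_swap; /* swap constraint, e.g. due to memsw.max */
	bool gfp_io;   /* whether __GFP_IO is set in gfp_mask */
};

/* base is what get_swappiness() would compute before the new checks */
static int effective_swappiness(const struct scan_ctl *sc, int base)
{
	if (!sc->may_swap)
		return 0; /* swapping is disallowed outright */

	/* under IO constraint, clean file pages are more likely to exist */
	if (base && !sc->gfp_io)
		return 1;

	return base;
}

Computing this once per try_to_shrink_lruvec() call, rather than on every loop iteration, is the structural change the diff makes; the policy itself is unchanged apart from the may_swap short-circuit moving into get_swappiness().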
@ -0,0 +1,34 @@
From cf3297e4c7a928da8b2b2f0baff2f9c69ea57952 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:06 -0700
Subject: [PATCH 28/29] mm: multi-gen LRU: simplify arch_has_hw_pte_young()
check

Scanning page tables when hardware does not set the accessed bit has
no real use cases.

Link: https://lkml.kernel.org/r/20221222041905.2431096-9-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4144,7 +4144,7 @@ static bool try_to_inc_max_seq(struct lr
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@ -0,0 +1,88 @@
From cc67f962cc53f6e1dfa92eb85b7b26fe83a3c66f Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 13 Feb 2023 00:53:22 -0700
Subject: [PATCH 29/29] mm: multi-gen LRU: avoid futile retries

Recall that the per-node memcg LRU has two generations and they alternate
when the last memcg (of a given node) is moved from one to the other.
Each generation is also sharded into multiple bins to improve scalability.
A reclaimer starts with a random bin (in the old generation) and, if it
fails, it will retry, i.e., try the rest of the bins.

If a reclaimer fails with the last memcg, it should move this memcg to the
young generation first, which causes the generations to alternate, and
then retry. Otherwise, the retries will be futile because all other bins
are empty.

Link: https://lkml.kernel.org/r/20230213075322.1416966-1-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: T.J. Mercier <tjmercier@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 25 +++++++++++++++----------
1 file changed, 15 insertions(+), 10 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4934,18 +4934,20 @@ static int shrink_one(struct lruvec *lru

static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
{
+ int op;
int gen;
int bin;
int first_bin;
struct lruvec *lruvec;
struct lru_gen_page *lrugen;
+ struct mem_cgroup *memcg;
const struct hlist_nulls_node *pos;
- int op = 0;
- struct mem_cgroup *memcg = NULL;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
restart:
+ op = 0;
+ memcg = NULL;
gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));

rcu_read_lock();
@@ -4969,14 +4971,22 @@ restart:

op = shrink_one(lruvec, sc);

- if (sc->nr_reclaimed >= nr_to_reclaim)
- goto success;
-
rcu_read_lock();
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ break;
}

rcu_read_unlock();

+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return;
+
/* restart if raced with lru_gen_rotate_memcg() */
if (gen != get_nulls_value(pos))
goto restart;
@@ -4985,11 +4995,6 @@ restart:
bin = get_memcg_bin(bin + 1);
if (bin != first_bin)
goto restart;
-success:
- if (op)
- lru_gen_rotate_memcg(lruvec, op);
-
- mem_cgroup_put(memcg);
}

static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@ -0,0 +1,65 @@
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Date: Tue, 5 Oct 2021 18:54:02 +0200
Subject: [PATCH] MIPS: uasm: Enable muhu opcode for MIPS R6

Enable the 'muhu' instruction, complementing the existing 'mulu', needed
to implement a MIPS32 BPF JIT.

Also fix a typo in the existing definition of 'dmulu'.

Signed-off-by: Tony Ambardar <Tony.Ambardar@gmail.com>

This patch is a dependency for my 32-bit MIPS eBPF JIT.

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
---

--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -145,6 +145,7 @@ Ip_u1(_mtlo);
Ip_u3u1u2(_mul);
Ip_u1u2(_multu);
Ip_u3u1u2(_mulu);
+Ip_u3u1u2(_muhu);
Ip_u3u1u2(_nor);
Ip_u3u1u2(_or);
Ip_u2u1u3(_ori);
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -90,7 +90,7 @@ static const struct insn insn_table[insn
RS | RT | RD},
[insn_dmtc0] = {M(cop0_op, dmtc_op, 0, 0, 0, 0), RT | RD | SET},
[insn_dmultu] = {M(spec_op, 0, 0, 0, 0, dmultu_op), RS | RT},
- [insn_dmulu] = {M(spec_op, 0, 0, 0, dmult_dmul_op, dmultu_op),
+ [insn_dmulu] = {M(spec_op, 0, 0, 0, dmultu_dmulu_op, dmultu_op),
RS | RT | RD},
[insn_drotr] = {M(spec_op, 1, 0, 0, 0, dsrl_op), RT | RD | RE},
[insn_drotr32] = {M(spec_op, 1, 0, 0, 0, dsrl32_op), RT | RD | RE},
@@ -150,6 +150,8 @@ static const struct insn insn_table[insn
[insn_mtlo] = {M(spec_op, 0, 0, 0, 0, mtlo_op), RS},
[insn_mulu] = {M(spec_op, 0, 0, 0, multu_mulu_op, multu_op),
RS | RT | RD},
+ [insn_muhu] = {M(spec_op, 0, 0, 0, multu_muhu_op, multu_op),
+ RS | RT | RD},
#ifndef CONFIG_CPU_MIPSR6
[insn_mul] = {M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD},
#else
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -59,7 +59,7 @@ enum opcode {
insn_lddir, insn_ldpte, insn_ldx, insn_lh, insn_lhu, insn_ll, insn_lld,
insn_lui, insn_lw, insn_lwu, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi,
insn_mflo, insn_modu, insn_movn, insn_movz, insn_mtc0, insn_mthc0,
- insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_mulu, insn_nor,
+ insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_mulu, insn_muhu, insn_nor,
insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sb, insn_sc,
insn_scd, insn_seleqz, insn_selnez, insn_sd, insn_sh, insn_sll,
insn_sllv, insn_slt, insn_slti, insn_sltiu, insn_sltu, insn_sra,
@@ -344,6 +344,7 @@ I_u1(_mtlo)
I_u3u1u2(_mul)
I_u1u2(_multu)
I_u3u1u2(_mulu)
+I_u3u1u2(_muhu)
I_u3u1u2(_nor)
I_u3u1u2(_or)
I_u2u1u3(_ori)
@ -0,0 +1,31 @@
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Date: Tue, 5 Oct 2021 18:54:03 +0200
Subject: [PATCH] mips: uasm: Add workaround for Loongson-2F nop CPU errata

This patch implements a workaround for the Loongson-2F nop in generated
code, if the existing option CONFIG_CPU_NOP_WORKAROUND is set. Before,
the binutils option -mfix-loongson2f-nop was enabled, but no workaround
was done when emitting MIPS code. Now, the nop pseudo instruction is
emitted as "or ax,ax,zero" instead of the default "sll zero,zero,0". This
is consistent with the workaround implemented by binutils.

Link: https://sourceware.org/legacy-ml/binutils/2009-11/msg00387.html

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Reviewed-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
---

--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -249,7 +249,11 @@ static inline void uasm_l##lb(struct uas
#define uasm_i_bnezl(buf, rs, off) uasm_i_bnel(buf, rs, 0, off)
#define uasm_i_ehb(buf) uasm_i_sll(buf, 0, 0, 3)
#define uasm_i_move(buf, a, b) UASM_i_ADDU(buf, a, 0, b)
+#ifdef CONFIG_CPU_NOP_WORKAROUNDS
+#define uasm_i_nop(buf) uasm_i_or(buf, 1, 1, 0)
+#else
#define uasm_i_nop(buf) uasm_i_sll(buf, 0, 0, 0)
+#endif
#define uasm_i_ssnop(buf) uasm_i_sll(buf, 0, 0, 1)

static inline void uasm_i_drotr_safe(u32 **p, unsigned int a1,
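Editor's note: since the commit message only names the two nop encodings, a small encoder makes the difference concrete. This is an illustration, not code from the tree; the mips_rtype() helper and the computed constants are assumptions based on the standard MIPS R-type instruction layout.

#include <stdint.h>
#include <stdio.h>

/* R-type layout: opcode=0 (SPECIAL) | rs | rt | rd | shamt | funct */
static uint32_t mips_rtype(uint32_t rs, uint32_t rt, uint32_t rd,
			   uint32_t shamt, uint32_t funct)
{
	return (rs << 21) | (rt << 16) | (rd << 11) | (shamt << 6) | funct;
}

int main(void)
{
	/* default nop: sll zero,zero,0 encodes as all zero bits */
	printf("sll zero,zero,0 = 0x%08x\n", mips_rtype(0, 0, 0, 0, 0x00));
	/* workaround nop from uasm_i_or(buf, 1, 1, 0): or at,at,zero */
	printf("or at,at,zero   = 0x%08x\n", mips_rtype(1, 0, 1, 0, 0x25));
	return 0;
}

The all-zero encoding is the one the Loongson-2F errata concerns, which is why both binutils and, with this patch, uasm substitute a register-preserving or instruction.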
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,120 @@
|
||||
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
|
||||
Date: Tue, 5 Oct 2021 18:54:06 +0200
|
||||
Subject: [PATCH] mips: bpf: Add JIT workarounds for CPU errata

This patch adds workarounds for the following CPU errata to the MIPS
eBPF JIT, if enabled in the kernel configuration.

- R10000 ll/sc weak ordering
- Loongson-3 ll/sc weak ordering
- Loongson-2F jump hang

The Loongson-2F nop errata is implemented in uasm, which the JIT uses,
so no additional mitigations are needed for that.

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Reviewed-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
---

--- a/arch/mips/net/bpf_jit_comp.c
+++ b/arch/mips/net/bpf_jit_comp.c
@@ -404,6 +404,7 @@ void emit_alu_r(struct jit_context *ctx,
/* Atomic read-modify-write (32-bit) */
void emit_atomic_r(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 code)
{
+ LLSC_sync(ctx);
emit(ctx, ll, MIPS_R_T9, off, dst);
switch (code) {
case BPF_ADD:
@@ -420,18 +421,19 @@ void emit_atomic_r(struct jit_context *c
break;
}
emit(ctx, sc, MIPS_R_T8, off, dst);
- emit(ctx, beqz, MIPS_R_T8, -16);
+ emit(ctx, LLSC_beqz, MIPS_R_T8, -16 - LLSC_offset);
emit(ctx, nop); /* Delay slot */
}

/* Atomic compare-and-exchange (32-bit) */
void emit_cmpxchg_r(struct jit_context *ctx, u8 dst, u8 src, u8 res, s16 off)
{
+ LLSC_sync(ctx);
emit(ctx, ll, MIPS_R_T9, off, dst);
emit(ctx, bne, MIPS_R_T9, res, 12);
emit(ctx, move, MIPS_R_T8, src); /* Delay slot */
emit(ctx, sc, MIPS_R_T8, off, dst);
- emit(ctx, beqz, MIPS_R_T8, -20);
+ emit(ctx, LLSC_beqz, MIPS_R_T8, -20 - LLSC_offset);
emit(ctx, move, res, MIPS_R_T9); /* Delay slot */
clobber_reg(ctx, res);
}
--- a/arch/mips/net/bpf_jit_comp.h
+++ b/arch/mips/net/bpf_jit_comp.h
@@ -87,7 +87,7 @@ struct jit_context {
};

/* Emit the instruction if the JIT memory space has been allocated */
-#define emit(ctx, func, ...) \
+#define __emit(ctx, func, ...) \
do { \
if ((ctx)->target != NULL) { \
u32 *p = &(ctx)->target[ctx->jit_index]; \
@@ -95,6 +95,30 @@ do { \
} \
(ctx)->jit_index++; \
} while (0)
+#define emit(...) __emit(__VA_ARGS__)
+
+/* Workaround for R10000 ll/sc errata */
+#ifdef CONFIG_WAR_R10000
+#define LLSC_beqz beqzl
+#else
+#define LLSC_beqz beqz
+#endif
+
+/* Workaround for Loongson-3 ll/sc errata */
+#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS
+#define LLSC_sync(ctx) emit(ctx, sync, 0)
+#define LLSC_offset 4
+#else
+#define LLSC_sync(ctx)
+#define LLSC_offset 0
+#endif
+
+/* Workaround for Loongson-2F jump errata */
+#ifdef CONFIG_CPU_JUMP_WORKAROUNDS
+#define JALR_MASK 0xffffffffcfffffffULL
+#else
+#define JALR_MASK (~0ULL)
+#endif

/*
* Mark a BPF register as accessed, it needs to be
--- a/arch/mips/net/bpf_jit_comp64.c
+++ b/arch/mips/net/bpf_jit_comp64.c
@@ -375,6 +375,7 @@ static void emit_atomic_r64(struct jit_c
u8 t1 = MIPS_R_T6;
u8 t2 = MIPS_R_T7;

+ LLSC_sync(ctx);
emit(ctx, lld, t1, off, dst);
switch (code) {
case BPF_ADD:
@@ -391,7 +392,7 @@ static void emit_atomic_r64(struct jit_c
break;
}
emit(ctx, scd, t2, off, dst);
- emit(ctx, beqz, t2, -16);
+ emit(ctx, LLSC_beqz, t2, -16 - LLSC_offset);
emit(ctx, nop); /* Delay slot */
}

@@ -414,7 +415,7 @@ static int emit_call(struct jit_context
push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, 0, 0);

/* Emit function call */
- emit_mov_i64(ctx, tmp, addr);
+ emit_mov_i64(ctx, tmp, addr & JALR_MASK);
emit(ctx, jalr, MIPS_R_RA, tmp);
emit(ctx, nop); /* Delay slot */
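
The branch-offset arithmetic in the hunks above is worth spelling out. Below is
a minimal, self-contained C sketch (not part of the patch; the macro name is a
stand-in for the kernel's LLSC_offset, and the -16/-20 constants mirror the
hunks above):

#include <stdio.h>

/* Define this to model CONFIG_CPU_LOONGSON3_WORKAROUNDS: a leading sync
 * pushes the ll one slot later, growing the backwards branch by 4 bytes. */
#ifdef LOONGSON3_WORKAROUND_SKETCH
#define LLSC_OFFSET 4
#else
#define LLSC_OFFSET 0
#endif

int main(void)
{
	/* emit_atomic_r(): branch back over ll + modify + sc (4 slots). */
	printf("atomic_r  beqz displacement: %d bytes\n", -16 - LLSC_OFFSET);
	/* emit_cmpxchg_r(): one extra instruction in the loop body. */
	printf("cmpxchg_r beqz displacement: %d bytes\n", -20 - LLSC_OFFSET);
	return 0;
}
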
@ -0,0 +1,61 @@
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Date: Tue, 5 Oct 2021 18:54:07 +0200
Subject: [PATCH] mips: bpf: Enable eBPF JITs

This patch enables the new eBPF JITs for 32-bit and 64-bit MIPS. It also
disables the old cBPF JIT so that cBPF programs are converted to use the
new JIT.

Workarounds for R4000 CPU errata are not implemented by the JIT, so the
JIT is disabled if any of those workarounds are configured.

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
---

--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3431,6 +3431,7 @@ S: Supported
F: arch/arm64/net/

BPF JIT for MIPS (32-BIT AND 64-BIT)
+M: Johan Almbladh <johan.almbladh@anyfinetworks.com>
M: Paul Burton <paulburton@kernel.org>
L: netdev@vger.kernel.org
L: bpf@vger.kernel.org
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -57,7 +57,6 @@ config MIPS
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES
select HAVE_ASM_MODVERSIONS
- select HAVE_CBPF_JIT if !64BIT && !CPU_MICROMIPS
select HAVE_CONTEXT_TRACKING
select HAVE_TIF_NOHZ
select HAVE_C_RECORDMCOUNT
@@ -65,7 +64,10 @@ config MIPS
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
- select HAVE_EBPF_JIT if 64BIT && !CPU_MICROMIPS && TARGET_ISA_REV >= 2
+ select HAVE_EBPF_JIT if !CPU_MICROMIPS && \
+ !CPU_DADDI_WORKAROUNDS && \
+ !CPU_R4000_WORKAROUNDS && \
+ !CPU_R4400_WORKAROUNDS
select HAVE_EXIT_THREAD
select HAVE_FAST_GUP
select HAVE_FTRACE_MCOUNT_RECORD
--- a/arch/mips/net/Makefile
+++ b/arch/mips/net/Makefile
@@ -2,9 +2,10 @@
# MIPS networking code

obj-$(CONFIG_MIPS_CBPF_JIT) += bpf_jit.o bpf_jit_asm.o
+obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp.o

ifeq ($(CONFIG_32BIT),y)
- obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp.o bpf_jit_comp32.o
+ obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp32.o
else
- obj-$(CONFIG_MIPS_EBPF_JIT) += ebpf_jit.o
+ obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp64.o
endif
@ -0,0 +1,387 @@
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Date: Tue, 5 Oct 2021 18:54:08 +0200
Subject: [PATCH] mips: bpf: Remove old BPF JIT implementations

This patch removes the old 32-bit cBPF and 64-bit eBPF JIT implementations.
They are replaced by a new eBPF implementation that supports both 32-bit
and 64-bit MIPS CPUs.

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
---
delete mode 100644 arch/mips/net/bpf_jit.c
delete mode 100644 arch/mips/net/bpf_jit.h
delete mode 100644 arch/mips/net/bpf_jit_asm.S
delete mode 100644 arch/mips/net/ebpf_jit.c

--- a/arch/mips/net/bpf_jit.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Just-In-Time compiler for BPF filters on MIPS
- *
- * Copyright (c) 2014 Imagination Technologies Ltd.
- * Author: Markos Chandras <markos.chandras@imgtec.com>
- */
-
-#ifndef BPF_JIT_MIPS_OP_H
-#define BPF_JIT_MIPS_OP_H
-
-/* Registers used by JIT */
-#define MIPS_R_ZERO 0
-#define MIPS_R_V0 2
-#define MIPS_R_A0 4
-#define MIPS_R_A1 5
-#define MIPS_R_T4 12
-#define MIPS_R_T5 13
-#define MIPS_R_T6 14
-#define MIPS_R_T7 15
-#define MIPS_R_S0 16
-#define MIPS_R_S1 17
-#define MIPS_R_S2 18
-#define MIPS_R_S3 19
-#define MIPS_R_S4 20
-#define MIPS_R_S5 21
-#define MIPS_R_S6 22
-#define MIPS_R_S7 23
-#define MIPS_R_SP 29
-#define MIPS_R_RA 31
-
-/* Conditional codes */
-#define MIPS_COND_EQ 0x1
-#define MIPS_COND_GE (0x1 << 1)
-#define MIPS_COND_GT (0x1 << 2)
-#define MIPS_COND_NE (0x1 << 3)
-#define MIPS_COND_ALL (0x1 << 4)
-/* Conditionals on X register or K immediate */
-#define MIPS_COND_X (0x1 << 5)
-#define MIPS_COND_K (0x1 << 6)
-
-#define r_ret MIPS_R_V0
-
-/*
- * Use 2 scratch registers to avoid pipeline interlocks.
- * There is no overhead during epilogue and prologue since
- * any of the $s0-$s6 registers will only be preserved if
- * they are going to actually be used.
- */
-#define r_skb_hl MIPS_R_S0 /* skb header length */
-#define r_skb_data MIPS_R_S1 /* skb actual data */
-#define r_off MIPS_R_S2
-#define r_A MIPS_R_S3
-#define r_X MIPS_R_S4
-#define r_skb MIPS_R_S5
-#define r_M MIPS_R_S6
-#define r_skb_len MIPS_R_S7
-#define r_s0 MIPS_R_T4 /* scratch reg 1 */
-#define r_s1 MIPS_R_T5 /* scratch reg 2 */
-#define r_tmp_imm MIPS_R_T6 /* No need to preserve this */
-#define r_tmp MIPS_R_T7 /* No need to preserve this */
-#define r_zero MIPS_R_ZERO
-#define r_sp MIPS_R_SP
-#define r_ra MIPS_R_RA
-
-#ifndef __ASSEMBLY__
-
-/* Declare ASM helpers */
-
-#define DECLARE_LOAD_FUNC(func) \
- extern u8 func(unsigned long *skb, int offset); \
- extern u8 func##_negative(unsigned long *skb, int offset); \
- extern u8 func##_positive(unsigned long *skb, int offset)
-
-DECLARE_LOAD_FUNC(sk_load_word);
-DECLARE_LOAD_FUNC(sk_load_half);
-DECLARE_LOAD_FUNC(sk_load_byte);
-
-#endif
-
-#endif /* BPF_JIT_MIPS_OP_H */
--- a/arch/mips/net/bpf_jit_asm.S
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * bpf_jib_asm.S: Packet/header access helper functions for MIPS/MIPS64 BPF
- * compiler.
- *
- * Copyright (C) 2015 Imagination Technologies Ltd.
- * Author: Markos Chandras <markos.chandras@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2 of the License.
- */
-
-#include <asm/asm.h>
-#include <asm/isa-rev.h>
-#include <asm/regdef.h>
-#include "bpf_jit.h"
-
-/* ABI
- *
- * r_skb_hl skb header length
- * r_skb_data skb data
- * r_off(a1) offset register
- * r_A BPF register A
- * r_X PF register X
- * r_skb(a0) *skb
- * r_M *scratch memory
- * r_skb_le skb length
- * r_s0 Scratch register 0
- * r_s1 Scratch register 1
- *
- * On entry:
- * a0: *skb
- * a1: offset (imm or imm + X)
- *
- * All non-BPF-ABI registers are free for use. On return, we only
- * care about r_ret. The BPF-ABI registers are assumed to remain
- * unmodified during the entire filter operation.
- */
-
-#define skb a0
-#define offset a1
-#define SKF_LL_OFF (-0x200000) /* Can't include linux/filter.h in assembly */
-
- /* We know better :) so prevent assembler reordering etc */
- .set noreorder
-
-#define is_offset_negative(TYPE) \
- /* If offset is negative we have more work to do */ \
- slti t0, offset, 0; \
- bgtz t0, bpf_slow_path_##TYPE##_neg; \
- /* Be careful what follows in DS. */
-
-#define is_offset_in_header(SIZE, TYPE) \
- /* Reading from header? */ \
- addiu $r_s0, $r_skb_hl, -SIZE; \
- slt t0, $r_s0, offset; \
- bgtz t0, bpf_slow_path_##TYPE; \
-
-LEAF(sk_load_word)
- is_offset_negative(word)
-FEXPORT(sk_load_word_positive)
- is_offset_in_header(4, word)
- /* Offset within header boundaries */
- PTR_ADDU t1, $r_skb_data, offset
- .set reorder
- lw $r_A, 0(t1)
- .set noreorder
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-# if MIPS_ISA_REV >= 2
- wsbh t0, $r_A
- rotr $r_A, t0, 16
-# else
- sll t0, $r_A, 24
- srl t1, $r_A, 24
- srl t2, $r_A, 8
- or t0, t0, t1
- andi t2, t2, 0xff00
- andi t1, $r_A, 0xff00
- or t0, t0, t2
- sll t1, t1, 8
- or $r_A, t0, t1
-# endif
-#endif
- jr $r_ra
- move $r_ret, zero
- END(sk_load_word)
-
-LEAF(sk_load_half)
- is_offset_negative(half)
-FEXPORT(sk_load_half_positive)
- is_offset_in_header(2, half)
- /* Offset within header boundaries */
- PTR_ADDU t1, $r_skb_data, offset
- lhu $r_A, 0(t1)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-# if MIPS_ISA_REV >= 2
- wsbh $r_A, $r_A
-# else
- sll t0, $r_A, 8
- srl t1, $r_A, 8
- andi t0, t0, 0xff00
- or $r_A, t0, t1
-# endif
-#endif
- jr $r_ra
- move $r_ret, zero
- END(sk_load_half)
-
-LEAF(sk_load_byte)
- is_offset_negative(byte)
-FEXPORT(sk_load_byte_positive)
- is_offset_in_header(1, byte)
- /* Offset within header boundaries */
- PTR_ADDU t1, $r_skb_data, offset
- lbu $r_A, 0(t1)
- jr $r_ra
- move $r_ret, zero
- END(sk_load_byte)
-
-/*
- * call skb_copy_bits:
- * (prototype in linux/skbuff.h)
- *
- * int skb_copy_bits(sk_buff *skb, int offset, void *to, int len)
- *
- * o32 mandates we leave 4 spaces for argument registers in case
- * the callee needs to use them. Even though we don't care about
- * the argument registers ourselves, we need to allocate that space
- * to remain ABI compliant since the callee may want to use that space.
- * We also allocate 2 more spaces for $r_ra and our return register (*to).
- *
- * n64 is a bit different. The *caller* will allocate the space to preserve
- * the arguments. So in 64-bit kernels, we allocate the 4-arg space for no
- * good reason but it does not matter that much really.
- *
- * (void *to) is returned in r_s0
- *
- */
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-#define DS_OFFSET(SIZE) (4 * SZREG)
-#else
-#define DS_OFFSET(SIZE) ((4 * SZREG) + (4 - SIZE))
-#endif
-#define bpf_slow_path_common(SIZE) \
- /* Quick check. Are we within reasonable boundaries? */ \
- LONG_ADDIU $r_s1, $r_skb_len, -SIZE; \
- sltu $r_s0, offset, $r_s1; \
- beqz $r_s0, fault; \
- /* Load 4th argument in DS */ \
- LONG_ADDIU a3, zero, SIZE; \
- PTR_ADDIU $r_sp, $r_sp, -(6 * SZREG); \
- PTR_LA t0, skb_copy_bits; \
- PTR_S $r_ra, (5 * SZREG)($r_sp); \
- /* Assign low slot to a2 */ \
- PTR_ADDIU a2, $r_sp, DS_OFFSET(SIZE); \
- jalr t0; \
- /* Reset our destination slot (DS but it's ok) */ \
- INT_S zero, (4 * SZREG)($r_sp); \
- /* \
- * skb_copy_bits returns 0 on success and -EFAULT \
- * on error. Our data live in a2. Do not bother with \
- * our data if an error has been returned. \
- */ \
- /* Restore our frame */ \
- PTR_L $r_ra, (5 * SZREG)($r_sp); \
- INT_L $r_s0, (4 * SZREG)($r_sp); \
- bltz v0, fault; \
- PTR_ADDIU $r_sp, $r_sp, 6 * SZREG; \
- move $r_ret, zero; \
-
-NESTED(bpf_slow_path_word, (6 * SZREG), $r_sp)
- bpf_slow_path_common(4)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-# if MIPS_ISA_REV >= 2
- wsbh t0, $r_s0
- jr $r_ra
- rotr $r_A, t0, 16
-# else
- sll t0, $r_s0, 24
- srl t1, $r_s0, 24
- srl t2, $r_s0, 8
- or t0, t0, t1
- andi t2, t2, 0xff00
- andi t1, $r_s0, 0xff00
- or t0, t0, t2
- sll t1, t1, 8
- jr $r_ra
- or $r_A, t0, t1
-# endif
-#else
- jr $r_ra
- move $r_A, $r_s0
-#endif
-
- END(bpf_slow_path_word)
-
-NESTED(bpf_slow_path_half, (6 * SZREG), $r_sp)
- bpf_slow_path_common(2)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-# if MIPS_ISA_REV >= 2
- jr $r_ra
- wsbh $r_A, $r_s0
-# else
- sll t0, $r_s0, 8
- andi t1, $r_s0, 0xff00
- andi t0, t0, 0xff00
- srl t1, t1, 8
- jr $r_ra
- or $r_A, t0, t1
-# endif
-#else
- jr $r_ra
- move $r_A, $r_s0
-#endif
-
- END(bpf_slow_path_half)
-
-NESTED(bpf_slow_path_byte, (6 * SZREG), $r_sp)
- bpf_slow_path_common(1)
- jr $r_ra
- move $r_A, $r_s0
-
- END(bpf_slow_path_byte)
-
-/*
- * Negative entry points
- */
- .macro bpf_is_end_of_data
- li t0, SKF_LL_OFF
- /* Reading link layer data? */
- slt t1, offset, t0
- bgtz t1, fault
- /* Be careful what follows in DS. */
- .endm
-/*
- * call skb_copy_bits:
- * (prototype in linux/filter.h)
- *
- * void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
- * int k, unsigned int size)
- *
- * see above (bpf_slow_path_common) for ABI restrictions
- */
-#define bpf_negative_common(SIZE) \
- PTR_ADDIU $r_sp, $r_sp, -(6 * SZREG); \
- PTR_LA t0, bpf_internal_load_pointer_neg_helper; \
- PTR_S $r_ra, (5 * SZREG)($r_sp); \
- jalr t0; \
- li a2, SIZE; \
- PTR_L $r_ra, (5 * SZREG)($r_sp); \
- /* Check return pointer */ \
- beqz v0, fault; \
- PTR_ADDIU $r_sp, $r_sp, 6 * SZREG; \
- /* Preserve our pointer */ \
- move $r_s0, v0; \
- /* Set return value */ \
- move $r_ret, zero; \
-
-bpf_slow_path_word_neg:
- bpf_is_end_of_data
-NESTED(sk_load_word_negative, (6 * SZREG), $r_sp)
- bpf_negative_common(4)
- jr $r_ra
- lw $r_A, 0($r_s0)
- END(sk_load_word_negative)
-
-bpf_slow_path_half_neg:
- bpf_is_end_of_data
-NESTED(sk_load_half_negative, (6 * SZREG), $r_sp)
- bpf_negative_common(2)
- jr $r_ra
- lhu $r_A, 0($r_s0)
- END(sk_load_half_negative)
-
-bpf_slow_path_byte_neg:
- bpf_is_end_of_data
-NESTED(sk_load_byte_negative, (6 * SZREG), $r_sp)
- bpf_negative_common(1)
- jr $r_ra
- lbu $r_A, 0($r_s0)
- END(sk_load_byte_negative)
-
-fault:
- jr $r_ra
- addiu $r_ret, zero, 1
@ -0,0 +1,105 @@
From 815f0e738a8d5663a02350e2580706829144a722 Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Wed, 3 Nov 2021 09:50:59 +0100
Subject: [PATCH] clk: gate: Add devm_clk_hw_register_gate()

Add devm_clk_hw_register_gate() - devres-managed version of
clk_hw_register_gate()

Suggested-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Signed-off-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Link: https://lore.kernel.org/r/20211103085102.1656081-2-horatiu.vultur@microchip.com
---
drivers/clk/clk-gate.c | 35 +++++++++++++++++++++++++++++++++++
include/linux/clk-provider.h | 23 +++++++++++++++++++++++
2 files changed, 58 insertions(+)

--- a/drivers/clk/clk-gate.c
+++ b/drivers/clk/clk-gate.c
@@ -7,6 +7,7 @@
*/

#include <linux/clk-provider.h>
+#include <linux/device.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/io.h>
@@ -222,3 +223,37 @@ void clk_hw_unregister_gate(struct clk_h
kfree(gate);
}
EXPORT_SYMBOL_GPL(clk_hw_unregister_gate);
+
+static void devm_clk_hw_release_gate(struct device *dev, void *res)
+{
+ clk_hw_unregister_gate(*(struct clk_hw **)res);
+}
+
+struct clk_hw *__devm_clk_hw_register_gate(struct device *dev,
+ struct device_node *np, const char *name,
+ const char *parent_name, const struct clk_hw *parent_hw,
+ const struct clk_parent_data *parent_data,
+ unsigned long flags,
+ void __iomem *reg, u8 bit_idx,
+ u8 clk_gate_flags, spinlock_t *lock)
+{
+ struct clk_hw **ptr, *hw;
+
+ ptr = devres_alloc(devm_clk_hw_release_gate, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return ERR_PTR(-ENOMEM);
+
+ hw = __clk_hw_register_gate(dev, np, name, parent_name, parent_hw,
+ parent_data, flags, reg, bit_idx,
+ clk_gate_flags, lock);
+
+ if (!IS_ERR(hw)) {
+ *ptr = hw;
+ devres_add(dev, ptr);
+ } else {
+ devres_free(ptr);
+ }
+
+ return hw;
+}
+EXPORT_SYMBOL_GPL(__devm_clk_hw_register_gate);
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -490,6 +490,13 @@ struct clk_hw *__clk_hw_register_gate(st
unsigned long flags,
void __iomem *reg, u8 bit_idx,
u8 clk_gate_flags, spinlock_t *lock);
+struct clk_hw *__devm_clk_hw_register_gate(struct device *dev,
+ struct device_node *np, const char *name,
+ const char *parent_name, const struct clk_hw *parent_hw,
+ const struct clk_parent_data *parent_data,
+ unsigned long flags,
+ void __iomem *reg, u8 bit_idx,
+ u8 clk_gate_flags, spinlock_t *lock);
struct clk *clk_register_gate(struct device *dev, const char *name,
const char *parent_name, unsigned long flags,
void __iomem *reg, u8 bit_idx,
@@ -544,6 +551,22 @@ struct clk *clk_register_gate(struct dev
__clk_hw_register_gate((dev), NULL, (name), NULL, NULL, (parent_data), \
(flags), (reg), (bit_idx), \
(clk_gate_flags), (lock))
+/**
+ * devm_clk_hw_register_gate - register a gate clock with the clock framework
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_name: name of this clock's parent
+ * @flags: framework-specific flags for this clock
+ * @reg: register address to control gating of this clock
+ * @bit_idx: which bit in the register controls gating of this clock
+ * @clk_gate_flags: gate-specific flags for this clock
+ * @lock: shared register lock for this clock
+ */
+#define devm_clk_hw_register_gate(dev, name, parent_name, flags, reg, bit_idx,\
+ clk_gate_flags, lock) \
+ __devm_clk_hw_register_gate((dev), NULL, (name), (parent_name), NULL, \
+ NULL, (flags), (reg), (bit_idx), \
+ (clk_gate_flags), (lock))
void clk_unregister_gate(struct clk *clk);
void clk_hw_unregister_gate(struct clk_hw *hw);
int clk_gate_is_enabled(struct clk_hw *hw);
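
A consumer of the new helper might look like the sketch below. The device,
clock name, parent name, register offset and bit index are all hypothetical;
only the argument order of devm_clk_hw_register_gate() follows the macro
added above. Because the registration is devres-managed, the gate is
unregistered automatically on driver detach, so no remove() cleanup is
needed.

#include <linux/clk-provider.h>
#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_gate_lock);

static int example_clk_probe(struct platform_device *pdev)
{
	struct clk_hw *hw;
	void __iomem *base;

	base = devm_platform_ioremap_resource(pdev, 0);
	if (IS_ERR(base))
		return PTR_ERR(base);

	/* Gate controlled by bit 3 of the first register. */
	hw = devm_clk_hw_register_gate(&pdev->dev, "example_gate",
				       "example_parent", 0, base, 3, 0,
				       &example_gate_lock);
	return PTR_ERR_OR_ZERO(hw);
}
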
@ -0,0 +1,52 @@
From 02d6fdecb9c38de19065f6bed8d5214556fd061d Mon Sep 17 00:00:00 2001
From: Ansuel Smith <ansuelsmth@gmail.com>
Date: Thu, 4 Nov 2021 16:00:40 +0100
Subject: regmap: allow to define reg_update_bits for no bus configuration

Some devices require special handling for reg_update_bits and can't use
the normal regmap read/write logic. An example is when locking is
handled by the device and rmw operations need to be atomic.
Allow a dedicated function to be declared in regmap_config for
reg_update_bits in the no-bus configuration.

Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
Link: https://lore.kernel.org/r/20211104150040.1260-1-ansuelsmth@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
drivers/base/regmap/regmap.c | 1 +
include/linux/regmap.h | 7 +++++++
2 files changed, 8 insertions(+)

--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -877,6 +877,7 @@ struct regmap *__regmap_init(struct devi
if (!bus) {
map->reg_read = config->reg_read;
map->reg_write = config->reg_write;
+ map->reg_update_bits = config->reg_update_bits;

map->defer_caching = false;
goto skip_format_initialization;
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -290,6 +290,11 @@ typedef void (*regmap_unlock)(void *);
* read operation on a bus such as SPI, I2C, etc. Most of the
* devices do not need this.
* @reg_write: Same as above for writing.
+ * @reg_update_bits: Optional callback that if filled will be used to perform
+ * all the update_bits(rmw) operation. Should only be provided
+ * if the function require special handling with lock and reg
+ * handling and the operation cannot be represented as a simple
+ * update_bits operation on a bus such as SPI, I2C, etc.
* @fast_io: Register IO is fast. Use a spinlock instead of a mutex
* to perform locking. This field is ignored if custom lock/unlock
* functions are used (see fields lock/unlock of struct regmap_config).
@@ -372,6 +377,8 @@ struct regmap_config {

int (*reg_read)(void *context, unsigned int reg, unsigned int *val);
int (*reg_write)(void *context, unsigned int reg, unsigned int val);
+ int (*reg_update_bits)(void *context, unsigned int reg,
+ unsigned int mask, unsigned int val);

bool fast_io;
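
As a usage sketch, a no-bus regmap can now route read-modify-write through a
device-native atomic primitive. Every example_* name below is hypothetical;
only the .reg_update_bits field and its signature come from this patch.

#include <linux/regmap.h>

/* Hypothetical device-specific accessors, assumed to exist elsewhere. */
int example_hw_read(void *context, unsigned int reg, unsigned int *val);
int example_hw_write(void *context, unsigned int reg, unsigned int val);
int example_hw_rmw(void *context, unsigned int reg, unsigned int mask,
		   unsigned int val);

static const struct regmap_config example_regmap_config = {
	.reg_bits = 32,
	.val_bits = 32,
	.reg_read = example_hw_read,
	.reg_write = example_hw_write,
	/* With this set, regmap_update_bits() invokes the device's own
	 * atomic RMW instead of a non-atomic read + write pair. */
	.reg_update_bits = example_hw_rmw,
};
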
@ -0,0 +1,37 @@
From 0dc0da881b4574d1e04a079ab2ea75da61f5ad2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Fri, 11 Mar 2022 10:32:33 +0100
Subject: [PATCH] tty: serial: bcm63xx: use more precise Kconfig symbol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patches lowering SERIAL_BCM63XX dependencies led to a discussion and
documentation change regarding "depends" usage. Adjust Kconfig entry to
match current guidelines. Make this symbol available for relevant
architectures only.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Ref: f35a07f92616 ("tty: serial: bcm63xx: lower driver dependencies")
Ref: 18084e435ff6 ("Documentation/kbuild: Document platform dependency practises")
Link: https://lore.kernel.org/r/20220311093233.10012-1-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
drivers/tty/serial/Kconfig | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1098,7 +1098,8 @@ config SERIAL_TIMBERDALE
config SERIAL_BCM63XX
tristate "Broadcom BCM63xx/BCM33xx UART support"
select SERIAL_CORE
- depends on COMMON_CLK
+ depends on ARCH_BCM4908 || ARCH_BCM_63XX || BCM63XX || BMIPS_GENERIC || COMPILE_TEST
+ default ARCH_BCM4908 || ARCH_BCM_63XX || BCM63XX || BMIPS_GENERIC
help
This enables the driver for the onchip UART core found on
the following chipsets:
@ -0,0 +1,49 @@
From cdbc4e3399ed8cdcf234a85f7a2482b622379e82 Mon Sep 17 00:00:00 2001
From: Connor O'Brien <connoro@google.com>
Date: Wed, 12 Jan 2022 00:25:03 +0000
Subject: [PATCH] tools/resolve_btfids: Build with host flags

resolve_btfids is built using $(HOSTCC) and $(HOSTLD) but does not
pick up the corresponding flags. As a result, host-specific settings
(such as a sysroot specified via HOSTCFLAGS=--sysroot=..., or a linker
specified via HOSTLDFLAGS=-fuse-ld=...) will not be respected.

Fix this by setting CFLAGS to KBUILD_HOSTCFLAGS and LDFLAGS to
KBUILD_HOSTLDFLAGS.

Also pass the cflags through to libbpf via EXTRA_CFLAGS to ensure that
the host libbpf is built with flags consistent with resolve_btfids.

Signed-off-by: Connor O'Brien <connoro@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20220112002503.115968-1-connoro@google.com
(cherry picked from commit 0e3a1c902ffb56e9fe4416f0cd382c97b09ecbf6)
Signed-off-by: Stijn Tintel <stijn@linux-ipv6.be>
---
tools/bpf/resolve_btfids/Makefile | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -23,6 +23,8 @@ CC = $(HOSTCC)
LD = $(HOSTLD)
ARCH = $(HOSTARCH)
RM ?= rm
+CFLAGS := $(KBUILD_HOSTCFLAGS)
+LDFLAGS := $(KBUILD_HOSTLDFLAGS)

OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/

@@ -45,9 +47,9 @@ $(SUBCMDOBJ): fixdep FORCE | $(OUTPUT)/l
$(Q)$(MAKE) -C $(SUBCMD_SRC) OUTPUT=$(abspath $(dir $@))/ $(abspath $@)

$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
- $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(abspath $(dir $@))/ $(abspath $@)
+ $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(abspath $(dir $@))/ EXTRA_CFLAGS="$(CFLAGS)" $(abspath $@)

-CFLAGS := -g \
+CFLAGS += -g \
-I$(srctree)/tools/include \
-I$(srctree)/tools/include/uapi \
-I$(LIBBPF_SRC) \
@ -0,0 +1,997 @@
From a77725a9a3c5924e2fd4cd5b3557dd92a8e46f87 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Mon, 25 Oct 2021 11:05:45 -0500
Subject: [PATCH 1/1] scripts/dtc: Update to upstream version
v1.6.1-19-g0a3a9d3449c8

This adds the following commits from upstream:

0a3a9d3449c8 checks: Add an interrupt-map check
8fd24744e361 checks: Ensure '#interrupt-cells' only exists in interrupt providers
d8d1a9a77863 checks: Drop interrupt provider '#address-cells' check
52a16fd72824 checks: Make interrupt_provider check dependent on interrupts_extended_is_cell
37fd700685da treesource: Maintain phandle label/path on output
e33ce1d6a8c7 flattree: Use '\n', not ';' to separate asm pseudo-ops
d24cc189dca6 asm: Use assembler macros instead of cpp macros
ff3a30c115ad asm: Use .asciz and .ascii instead of .string
5eb5927d81ee fdtdump: fix -Werror=int-to-pointer-cast
0869f8269161 libfdt: Add ALIGNMENT error string
69595a167f06 checks: Fix bus-range check
72d09e2682a4 Makefile: add -Wsign-compare to warning options
b587787ef388 checks: Fix signedness comparisons warnings
69bed6c2418f dtc: Wrap phandle validity check
910221185560 fdtget: Fix signedness comparisons warnings
d966f08fcd21 tests: Fix signedness comparisons warnings
ecfb438c07fa dtc: Fix signedness comparisons warnings: pointer diff
5bec74a6d135 dtc: Fix signedness comparisons warnings: reservednum
24e7f511fd4a fdtdump: Fix signedness comparisons warnings
b6910bec1161 Bump version to v1.6.1
21d61d18f968 Fix CID 1461557
4c2ef8f4d14c checks: Introduce is_multiple_of()
e59ca36fb70e Make handling of cpp line information more tolerant
0c3fd9b6aceb checks: Drop interrupt_cells_is_cell check
6b3081abc4ac checks: Add check_is_cell() for all phandle+arg properties
2dffc192a77f yamltree: Remove marker ordering dependency
61e513439e40 pylibfdt: Rework "avoid unused variable warning" lines
c8bddd106095 tests: add a positive gpio test case
ad4abfadb687 checks: replace strstr and strrchr with strends
09c6a6e88718 dtc.h: add strends for suffix matching
9bb9b8d0b4a0 checks: tigthen up nr-gpios prop exception
b07b62ee3342 libfdt: Add FDT alignment check to fdt_check_header()
a2def5479950 libfdt: Check that the root-node name is empty
4ca61f84dc21 libfdt: Check that there is only one root node
34d708249a91 dtc: Remove -O dtbo support
8e7ff260f755 libfdt: Fix a possible "unchecked return value" warning
88875268c05c checks: Warn on node-name and property name being the same
9d2279e7e6ee checks: Change node-name check to match devicetree spec
f527c867a8c6 util: limit gnu_printf format attribute to gcc >= 4.4.0

Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Tested-by: Frank Rowand <frank.rowand@sony.com>
Signed-off-by: Rob Herring <robh@kernel.org>
---
scripts/dtc/checks.c | 222 ++++++++++++++++++++++--------
scripts/dtc/dtc-lexer.l | 2 +-
scripts/dtc/dtc.c | 6 +-
scripts/dtc/dtc.h | 40 +++++-
scripts/dtc/flattree.c | 11 +-
scripts/dtc/libfdt/fdt.c | 4 +
scripts/dtc/libfdt/fdt_rw.c | 18 ++-
scripts/dtc/libfdt/fdt_strerror.c | 1 +
scripts/dtc/libfdt/libfdt.h | 7 +
scripts/dtc/livetree.c | 6 +-
scripts/dtc/treesource.c | 48 +++----
scripts/dtc/util.h | 6 +-
scripts/dtc/version_gen.h | 2 +-
scripts/dtc/yamltree.c | 16 ++-
14 files changed, 275 insertions(+), 114 deletions(-)

--- a/scripts/dtc/checks.c
+++ b/scripts/dtc/checks.c
@@ -143,6 +143,14 @@ static void check_nodes_props(struct che
check_nodes_props(c, dti, child);
}

+static bool is_multiple_of(int multiple, int divisor)
+{
+ if (divisor == 0)
+ return multiple == 0;
+ else
+ return (multiple % divisor) == 0;
+}
+
static bool run_check(struct check *c, struct dt_info *dti)
{
struct node *dt = dti->dt;
@@ -297,19 +305,20 @@ ERROR(duplicate_property_names, check_du
#define LOWERCASE "abcdefghijklmnopqrstuvwxyz"
#define UPPERCASE "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define DIGITS "0123456789"
-#define PROPNODECHARS LOWERCASE UPPERCASE DIGITS ",._+*#?-"
+#define NODECHARS LOWERCASE UPPERCASE DIGITS ",._+-@"
+#define PROPCHARS LOWERCASE UPPERCASE DIGITS ",._+*#?-"
#define PROPNODECHARSSTRICT LOWERCASE UPPERCASE DIGITS ",-"

static void check_node_name_chars(struct check *c, struct dt_info *dti,
struct node *node)
{
- int n = strspn(node->name, c->data);
+ size_t n = strspn(node->name, c->data);

if (n < strlen(node->name))
FAIL(c, dti, node, "Bad character '%c' in node name",
node->name[n]);
}
-ERROR(node_name_chars, check_node_name_chars, PROPNODECHARS "@");
+ERROR(node_name_chars, check_node_name_chars, NODECHARS);

static void check_node_name_chars_strict(struct check *c, struct dt_info *dti,
struct node *node)
@@ -330,6 +339,20 @@ static void check_node_name_format(struc
}
ERROR(node_name_format, check_node_name_format, NULL, &node_name_chars);

+static void check_node_name_vs_property_name(struct check *c,
+ struct dt_info *dti,
+ struct node *node)
+{
+ if (!node->parent)
+ return;
+
+ if (get_property(node->parent, node->name)) {
+ FAIL(c, dti, node, "node name and property name conflict");
+ }
+}
+WARNING(node_name_vs_property_name, check_node_name_vs_property_name,
+ NULL, &node_name_chars);
+
static void check_unit_address_vs_reg(struct check *c, struct dt_info *dti,
struct node *node)
{
@@ -363,14 +386,14 @@ static void check_property_name_chars(st
struct property *prop;

for_each_property(node, prop) {
- int n = strspn(prop->name, c->data);
+ size_t n = strspn(prop->name, c->data);

if (n < strlen(prop->name))
FAIL_PROP(c, dti, node, prop, "Bad character '%c' in property name",
prop->name[n]);
}
}
-ERROR(property_name_chars, check_property_name_chars, PROPNODECHARS);
+ERROR(property_name_chars, check_property_name_chars, PROPCHARS);

static void check_property_name_chars_strict(struct check *c,
struct dt_info *dti,
@@ -380,7 +403,7 @@ static void check_property_name_chars_st

for_each_property(node, prop) {
const char *name = prop->name;
- int n = strspn(name, c->data);
+ size_t n = strspn(name, c->data);

if (n == strlen(prop->name))
continue;
@@ -497,7 +520,7 @@ static cell_t check_phandle_prop(struct

phandle = propval_cell(prop);

- if ((phandle == 0) || (phandle == -1)) {
+ if (!phandle_is_valid(phandle)) {
FAIL_PROP(c, dti, node, prop, "bad value (0x%x) in %s property",
phandle, prop->name);
return 0;
@@ -556,7 +579,7 @@ static void check_name_properties(struct
if (!prop)
return; /* No name property, that's fine */

- if ((prop->val.len != node->basenamelen+1)
+ if ((prop->val.len != node->basenamelen + 1U)
|| (memcmp(prop->val.val, node->name, node->basenamelen) != 0)) {
FAIL(c, dti, node, "\"name\" property is incorrect (\"%s\" instead"
" of base node name)", prop->val.val);
@@ -657,7 +680,6 @@ ERROR(omit_unused_nodes, fixup_omit_unus
*/
WARNING_IF_NOT_CELL(address_cells_is_cell, "#address-cells");
WARNING_IF_NOT_CELL(size_cells_is_cell, "#size-cells");
-WARNING_IF_NOT_CELL(interrupt_cells_is_cell, "#interrupt-cells");

WARNING_IF_NOT_STRING(device_type_is_string, "device_type");
WARNING_IF_NOT_STRING(model_is_string, "model");
@@ -672,8 +694,7 @@ static void check_names_is_string_list(s
struct property *prop;

for_each_property(node, prop) {
- const char *s = strrchr(prop->name, '-');
- if (!s || !streq(s, "-names"))
+ if (!strends(prop->name, "-names"))
continue;

c->data = prop->name;
@@ -753,7 +774,7 @@ static void check_reg_format(struct chec
size_cells = node_size_cells(node->parent);
entrylen = (addr_cells + size_cells) * sizeof(cell_t);

- if (!entrylen || (prop->val.len % entrylen) != 0)
+ if (!is_multiple_of(prop->val.len, entrylen))
FAIL_PROP(c, dti, node, prop, "property has invalid length (%d bytes) "
"(#address-cells == %d, #size-cells == %d)",
prop->val.len, addr_cells, size_cells);
@@ -794,7 +815,7 @@ static void check_ranges_format(struct c
"#size-cells (%d) differs from %s (%d)",
ranges, c_size_cells, node->parent->fullpath,
p_size_cells);
- } else if ((prop->val.len % entrylen) != 0) {
+ } else if (!is_multiple_of(prop->val.len, entrylen)) {
FAIL_PROP(c, dti, node, prop, "\"%s\" property has invalid length (%d bytes) "
"(parent #address-cells == %d, child #address-cells == %d, "
"#size-cells == %d)", ranges, prop->val.len,
@@ -871,7 +892,7 @@ static void check_pci_device_bus_num(str
} else {
cells = (cell_t *)prop->val.val;
min_bus = fdt32_to_cpu(cells[0]);
- max_bus = fdt32_to_cpu(cells[0]);
+ max_bus = fdt32_to_cpu(cells[1]);
}
if ((bus_num < min_bus) || (bus_num > max_bus))
FAIL_PROP(c, dti, node, prop, "PCI bus number %d out of range, expected (%d - %d)",
@@ -1367,9 +1388,9 @@ static void check_property_phandle_args(
const struct provider *provider)
{
struct node *root = dti->dt;
- int cell, cellsize = 0;
+ unsigned int cell, cellsize = 0;

- if (prop->val.len % sizeof(cell_t)) {
+ if (!is_multiple_of(prop->val.len, sizeof(cell_t))) {
FAIL_PROP(c, dti, node, prop,
"property size (%d) is invalid, expected multiple of %zu",
prop->val.len, sizeof(cell_t));
@@ -1379,14 +1400,14 @@ static void check_property_phandle_args(
for (cell = 0; cell < prop->val.len / sizeof(cell_t); cell += cellsize + 1) {
struct node *provider_node;
struct property *cellprop;
- int phandle;
+ cell_t phandle;

phandle = propval_cell_n(prop, cell);
/*
* Some bindings use a cell value 0 or -1 to skip over optional
* entries when each index position has a specific definition.
*/
- if (phandle == 0 || phandle == -1) {
+ if (!phandle_is_valid(phandle)) {
/* Give up if this is an overlay with external references */
if (dti->dtsflags & DTSF_PLUGIN)
break;
@@ -1452,7 +1473,8 @@ static void check_provider_cells_propert
}
#define WARNING_PROPERTY_PHANDLE_CELLS(nm, propname, cells_name, ...) \
static struct provider nm##_provider = { (propname), (cells_name), __VA_ARGS__ }; \
- WARNING(nm##_property, check_provider_cells_property, &nm##_provider, &phandle_references);
+ WARNING_IF_NOT_CELL(nm##_is_cell, cells_name); \
+ WARNING(nm##_property, check_provider_cells_property, &nm##_provider, &nm##_is_cell, &phandle_references);

WARNING_PROPERTY_PHANDLE_CELLS(clocks, "clocks", "#clock-cells");
WARNING_PROPERTY_PHANDLE_CELLS(cooling_device, "cooling-device", "#cooling-cells");
@@ -1473,24 +1495,17 @@ WARNING_PROPERTY_PHANDLE_CELLS(thermal_s

static bool prop_is_gpio(struct property *prop)
{
- char *str;
-
/*
* *-gpios and *-gpio can appear in property names,
* so skip over any false matches (only one known ATM)
*/
- if (strstr(prop->name, "nr-gpio"))
+ if (strends(prop->name, ",nr-gpios"))
return false;

- str = strrchr(prop->name, '-');
- if (str)
- str++;
- else
- str = prop->name;
- if (!(streq(str, "gpios") || streq(str, "gpio")))
- return false;
-
- return true;
+ return strends(prop->name, "-gpios") ||
+ streq(prop->name, "gpios") ||
+ strends(prop->name, "-gpio") ||
+ streq(prop->name, "gpio");
}

static void check_gpios_property(struct check *c,
@@ -1525,13 +1540,10 @@ static void check_deprecated_gpio_proper
struct property *prop;

for_each_property(node, prop) {
- char *str;
-
if (!prop_is_gpio(prop))
continue;

- str = strstr(prop->name, "gpio");
- if (!streq(str, "gpio"))
+ if (!strends(prop->name, "gpio"))
continue;

FAIL_PROP(c, dti, node, prop,
@@ -1561,21 +1573,106 @@ static void check_interrupt_provider(str
struct node *node)
{
struct property *prop;
+ bool irq_provider = node_is_interrupt_provider(node);

- if (!node_is_interrupt_provider(node))
+ prop = get_property(node, "#interrupt-cells");
+ if (irq_provider && !prop) {
+ FAIL(c, dti, node,
+ "Missing '#interrupt-cells' in interrupt provider");
return;
+ }

- prop = get_property(node, "#interrupt-cells");
- if (!prop)
+ if (!irq_provider && prop) {
FAIL(c, dti, node,
- "Missing #interrupt-cells in interrupt provider");
+ "'#interrupt-cells' found, but node is not an interrupt provider");
+ return;
+ }
+}
+WARNING(interrupt_provider, check_interrupt_provider, NULL, &interrupts_extended_is_cell);

- prop = get_property(node, "#address-cells");
- if (!prop)
+static void check_interrupt_map(struct check *c,
+ struct dt_info *dti,
+ struct node *node)
+{
+ struct node *root = dti->dt;
+ struct property *prop, *irq_map_prop;
+ size_t cellsize, cell, map_cells;
+
+ irq_map_prop = get_property(node, "interrupt-map");
+ if (!irq_map_prop)
+ return;
+
+ if (node->addr_cells < 0) {
FAIL(c, dti, node,
- "Missing #address-cells in interrupt provider");
+ "Missing '#address-cells' in interrupt-map provider");
+ return;
+ }
+ cellsize = node_addr_cells(node);
+ cellsize += propval_cell(get_property(node, "#interrupt-cells"));
+
+ prop = get_property(node, "interrupt-map-mask");
+ if (prop && (prop->val.len != (cellsize * sizeof(cell_t))))
+ FAIL_PROP(c, dti, node, prop,
+ "property size (%d) is invalid, expected %zu",
+ prop->val.len, cellsize * sizeof(cell_t));
+
+ if (!is_multiple_of(irq_map_prop->val.len, sizeof(cell_t))) {
+ FAIL_PROP(c, dti, node, irq_map_prop,
+ "property size (%d) is invalid, expected multiple of %zu",
+ irq_map_prop->val.len, sizeof(cell_t));
+ return;
+ }
+
+ map_cells = irq_map_prop->val.len / sizeof(cell_t);
+ for (cell = 0; cell < map_cells; ) {
+ struct node *provider_node;
+ struct property *cellprop;
+ int phandle;
+ size_t parent_cellsize;
+
+ if ((cell + cellsize) >= map_cells) {
+ FAIL_PROP(c, dti, node, irq_map_prop,
+ "property size (%d) too small, expected > %zu",
+ irq_map_prop->val.len, (cell + cellsize) * sizeof(cell_t));
+ break;
+ }
+ cell += cellsize;
+
+ phandle = propval_cell_n(irq_map_prop, cell);
+ if (!phandle_is_valid(phandle)) {
+ /* Give up if this is an overlay with external references */
+ if (!(dti->dtsflags & DTSF_PLUGIN))
+ FAIL_PROP(c, dti, node, irq_map_prop,
+ "Cell %zu is not a phandle(%d)",
+ cell, phandle);
+ break;
+ }
+
+ provider_node = get_node_by_phandle(root, phandle);
+ if (!provider_node) {
+ FAIL_PROP(c, dti, node, irq_map_prop,
+ "Could not get phandle(%d) node for (cell %zu)",
+ phandle, cell);
+ break;
+ }
+
+ cellprop = get_property(provider_node, "#interrupt-cells");
+ if (cellprop) {
+ parent_cellsize = propval_cell(cellprop);
+ } else {
+ FAIL(c, dti, node, "Missing property '#interrupt-cells' in node %s or bad phandle (referred from interrupt-map[%zu])",
+ provider_node->fullpath, cell);
+ break;
+ }
+
+ cellprop = get_property(provider_node, "#address-cells");
+ if (cellprop)
+ parent_cellsize += propval_cell(cellprop);
+
+ cell += 1 + parent_cellsize;
+ }
}
-WARNING(interrupt_provider, check_interrupt_provider, NULL);
+WARNING(interrupt_map, check_interrupt_map, NULL, &phandle_references, &addr_size_cells, &interrupt_provider);

static void check_interrupts_property(struct check *c,
struct dt_info *dti,
@@ -1584,13 +1681,13 @@ static void check_interrupts_property(st
struct node *root = dti->dt;
struct node *irq_node = NULL, *parent = node;
struct property *irq_prop, *prop = NULL;
- int irq_cells, phandle;
+ cell_t irq_cells, phandle;

irq_prop = get_property(node, "interrupts");
if (!irq_prop)
return;

- if (irq_prop->val.len % sizeof(cell_t))
+ if (!is_multiple_of(irq_prop->val.len, sizeof(cell_t)))
FAIL_PROP(c, dti, node, irq_prop, "size (%d) is invalid, expected multiple of %zu",
irq_prop->val.len, sizeof(cell_t));

@@ -1603,7 +1700,7 @@ static void check_interrupts_property(st
prop = get_property(parent, "interrupt-parent");
if (prop) {
phandle = propval_cell(prop);
- if ((phandle == 0) || (phandle == -1)) {
+ if (!phandle_is_valid(phandle)) {
/* Give up if this is an overlay with
* external references */
if (dti->dtsflags & DTSF_PLUGIN)
@@ -1639,7 +1736,7 @@ static void check_interrupts_property(st
}

irq_cells = propval_cell(prop);
- if (irq_prop->val.len % (irq_cells * sizeof(cell_t))) {
+ if (!is_multiple_of(irq_prop->val.len, irq_cells * sizeof(cell_t))) {
FAIL_PROP(c, dti, node, prop,
"size is (%d), expected multiple of %d",
irq_prop->val.len, (int)(irq_cells * sizeof(cell_t)));
@@ -1750,7 +1847,7 @@ WARNING(graph_port, check_graph_port, NU
static struct node *get_remote_endpoint(struct check *c, struct dt_info *dti,
struct node *endpoint)
{
- int phandle;
+ cell_t phandle;
struct node *node;
struct property *prop;

@@ -1760,7 +1857,7 @@ static struct node *get_remote_endpoint(

phandle = propval_cell(prop);
/* Give up if this is an overlay with external references */
- if (phandle == 0 || phandle == -1)
+ if (!phandle_is_valid(phandle))
return NULL;

node = get_node_by_phandle(dti->dt, phandle);
@@ -1796,7 +1893,7 @@ WARNING(graph_endpoint, check_graph_endp
static struct check *check_table[] = {
&duplicate_node_names, &duplicate_property_names,
&node_name_chars, &node_name_format, &property_name_chars,
- &name_is_string, &name_properties,
+ &name_is_string, &name_properties, &node_name_vs_property_name,

&duplicate_label,

@@ -1804,7 +1901,7 @@ static struct check *check_table[] = {
&phandle_references, &path_references,
&omit_unused_nodes,

- &address_cells_is_cell, &size_cells_is_cell, &interrupt_cells_is_cell,
+ &address_cells_is_cell, &size_cells_is_cell,
&device_type_is_string, &model_is_string, &status_is_string,
&label_is_string,

@@ -1839,26 +1936,43 @@ static struct check *check_table[] = {
&chosen_node_is_root, &chosen_node_bootargs, &chosen_node_stdout_path,

&clocks_property,
+ &clocks_is_cell,
&cooling_device_property,
+ &cooling_device_is_cell,
&dmas_property,
+ &dmas_is_cell,
&hwlocks_property,
+ &hwlocks_is_cell,
&interrupts_extended_property,
+ &interrupts_extended_is_cell,
&io_channels_property,
+ &io_channels_is_cell,
&iommus_property,
+ &iommus_is_cell,
&mboxes_property,
+ &mboxes_is_cell,
&msi_parent_property,
+ &msi_parent_is_cell,
&mux_controls_property,
+ &mux_controls_is_cell,
&phys_property,
+ &phys_is_cell,
&power_domains_property,
+ &power_domains_is_cell,
&pwms_property,
+ &pwms_is_cell,
&resets_property,
+ &resets_is_cell,
&sound_dai_property,
+ &sound_dai_is_cell,
&thermal_sensors_property,
+ &thermal_sensors_is_cell,

&deprecated_gpio_property,
&gpios_property,
&interrupts_property,
&interrupt_provider,
+ &interrupt_map,

&alias_paths,

@@ -1882,7 +1996,7 @@ static void enable_warning_error(struct

static void disable_warning_error(struct check *c, bool warn, bool error)
{
- int i;
+ unsigned int i;

/* Lowering level, also lower it for things this is the prereq
* for */
@@ -1903,7 +2017,7 @@ static void disable_warning_error(struct

void parse_checks_option(bool warn, bool error, const char *arg)
{
- int i;
+ unsigned int i;
const char *name = arg;
bool enable = true;

@@ -1930,7 +2044,7 @@ void parse_checks_option(bool warn, bool

void process_checks(bool force, struct dt_info *dti)
{
- int i;
+ unsigned int i;
int error = 0;

for (i = 0; i < ARRAY_SIZE(check_table); i++) {
--- a/scripts/dtc/dtc-lexer.l
+++ b/scripts/dtc/dtc-lexer.l
@@ -57,7 +57,7 @@ static void PRINTF(1, 2) lexical_error(c
push_input_file(name);
}

-<*>^"#"(line)?[ \t]+[0-9]+[ \t]+{STRING}([ \t]+[0-9]+)? {
+<*>^"#"(line)?[ \t]+[0-9]+[ \t]+{STRING}([ \t]+[0-9]+)* {
char *line, *fnstart, *fnend;
struct data fn;
/* skip text before line # */
--- a/scripts/dtc/dtc.c
+++ b/scripts/dtc/dtc.c
@@ -12,7 +12,7 @@
* Command line options
*/
int quiet; /* Level of quietness */
-int reservenum; /* Number of memory reservation slots */
+unsigned int reservenum;/* Number of memory reservation slots */
int minsize; /* Minimum blob size */
int padsize; /* Additional padding to blob */
int alignsize; /* Additional padding to blob accroding to the alignsize */
@@ -197,7 +197,7 @@ int main(int argc, char *argv[])
depname = optarg;
break;
case 'R':
- reservenum = strtol(optarg, NULL, 0);
+ reservenum = strtoul(optarg, NULL, 0);
break;
case 'S':
minsize = strtol(optarg, NULL, 0);
@@ -359,8 +359,6 @@ int main(int argc, char *argv[])
#endif
} else if (streq(outform, "dtb")) {
dt_to_blob(outf, dti, outversion);
- } else if (streq(outform, "dtbo")) {
- dt_to_blob(outf, dti, outversion);
} else if (streq(outform, "asm")) {
dt_to_asm(outf, dti, outversion);
} else if (streq(outform, "null")) {
--- a/scripts/dtc/dtc.h
+++ b/scripts/dtc/dtc.h
@@ -35,7 +35,7 @@
* Command line options
*/
extern int quiet; /* Level of quietness */
-extern int reservenum; /* Number of memory reservation slots */
+extern unsigned int reservenum; /* Number of memory reservation slots */
extern int minsize; /* Minimum blob size */
extern int padsize; /* Additional padding to blob */
extern int alignsize; /* Additional padding to blob accroding to the alignsize */
@@ -51,6 +51,11 @@ extern int annotate; /* annotate .dts w

typedef uint32_t cell_t;

+static inline bool phandle_is_valid(cell_t phandle)
+{
+ return phandle != 0 && phandle != ~0U;
+}
+
static inline uint16_t dtb_ld16(const void *p)
{
const uint8_t *bp = (const uint8_t *)p;
@@ -86,6 +91,16 @@ static inline uint64_t dtb_ld64(const vo
#define streq(a, b) (strcmp((a), (b)) == 0)
#define strstarts(s, prefix) (strncmp((s), (prefix), strlen(prefix)) == 0)
#define strprefixeq(a, n, b) (strlen(b) == (n) && (memcmp(a, b, n) == 0))
+static inline bool strends(const char *str, const char *suffix)
+{
+ unsigned int len, suffix_len;
+
+ len = strlen(str);
+ suffix_len = strlen(suffix);
+ if (len < suffix_len)
+ return false;
+ return streq(str + len - suffix_len, suffix);
+}

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

@@ -101,6 +116,12 @@ enum markertype {
TYPE_UINT64,
TYPE_STRING,
};
+
+static inline bool is_type_marker(enum markertype type)
+{
+ return type >= TYPE_UINT8;
+}
+
extern const char *markername(enum markertype markertype);

struct marker {
@@ -125,7 +146,22 @@ struct data {
for_each_marker(m) \
if ((m)->type == (t))

-size_t type_marker_length(struct marker *m);
+static inline struct marker *next_type_marker(struct marker *m)
+{
+ for_each_marker(m)
+ if (is_type_marker(m->type))
+ break;
+ return m;
+}
+
+static inline size_t type_marker_length(struct marker *m)
+{
+ struct marker *next = next_type_marker(m->next);
+
+ if (next)
+ return next->offset - m->offset;
+ return 0;
+}

void data_free(struct data d);

--- a/scripts/dtc/flattree.c
+++ b/scripts/dtc/flattree.c
@@ -124,7 +124,8 @@ static void asm_emit_cell(void *e, cell_
{
FILE *f = e;

- fprintf(f, "\t.byte 0x%02x; .byte 0x%02x; .byte 0x%02x; .byte 0x%02x\n",
+ fprintf(f, "\t.byte\t0x%02x\n" "\t.byte\t0x%02x\n"
+ "\t.byte\t0x%02x\n" "\t.byte\t0x%02x\n",
(val >> 24) & 0xff, (val >> 16) & 0xff,
(val >> 8) & 0xff, val & 0xff);
}
@@ -134,9 +135,9 @@ static void asm_emit_string(void *e, con
FILE *f = e;

if (len != 0)
- fprintf(f, "\t.string\t\"%.*s\"\n", len, str);
+ fprintf(f, "\t.asciz\t\"%.*s\"\n", len, str);
else
- fprintf(f, "\t.string\t\"%s\"\n", str);
+ fprintf(f, "\t.asciz\t\"%s\"\n", str);
}

static void asm_emit_align(void *e, int a)
@@ -295,7 +296,7 @@ static struct data flatten_reserve_list(
{
struct reserve_info *re;
struct data d = empty_data;
- int j;
+ unsigned int j;

for (re = reservelist; re; re = re->next) {
d = data_append_re(d, re->address, re->size);
@@ -438,7 +439,7 @@ static void dump_stringtable_asm(FILE *f

while (p < (strbuf.val + strbuf.len)) {
len = strlen(p);
- fprintf(f, "\t.string \"%s\"\n", p);
+ fprintf(f, "\t.asciz \"%s\"\n", p);
p += len+1;
}
}
--- a/scripts/dtc/libfdt/fdt.c
+++ b/scripts/dtc/libfdt/fdt.c
@@ -90,6 +90,10 @@ int fdt_check_header(const void *fdt)
{
size_t hdrsize;

+ /* The device tree must be at an 8-byte aligned address */
+ if ((uintptr_t)fdt & 7)
+ return -FDT_ERR_ALIGNMENT;
+
if (fdt_magic(fdt) != FDT_MAGIC)
return -FDT_ERR_BADMAGIC;
if (!can_assume(LATEST)) {
--- a/scripts/dtc/libfdt/fdt_rw.c
+++ b/scripts/dtc/libfdt/fdt_rw.c
@@ -349,7 +349,10 @@ int fdt_add_subnode_namelen(void *fdt, i
return offset;

/* Try to place the new node after the parent's properties */
- fdt_next_tag(fdt, parentoffset, &nextoffset); /* skip the BEGIN_NODE */
+ tag = fdt_next_tag(fdt, parentoffset, &nextoffset);
+ /* the fdt_subnode_offset_namelen() should ensure this never hits */
+ if (!can_assume(LIBFDT_FLAWLESS) && (tag != FDT_BEGIN_NODE))
+ return -FDT_ERR_INTERNAL;
do {
offset = nextoffset;
tag = fdt_next_tag(fdt, offset, &nextoffset);
@@ -391,7 +394,9 @@ int fdt_del_node(void *fdt, int nodeoffs
}

static void fdt_packblocks_(const char *old, char *new,
- int mem_rsv_size, int struct_size)
+ int mem_rsv_size,
+ int struct_size,
+ int strings_size)
{
int mem_rsv_off, struct_off, strings_off;

@@ -406,8 +411,7 @@ static void fdt_packblocks_(const char *
fdt_set_off_dt_struct(new, struct_off);
fdt_set_size_dt_struct(new, struct_size);

- memmove(new + strings_off, old + fdt_off_dt_strings(old),
- fdt_size_dt_strings(old));
+ memmove(new + strings_off, old + fdt_off_dt_strings(old), strings_size);
fdt_set_off_dt_strings(new, strings_off);
fdt_set_size_dt_strings(new, fdt_size_dt_strings(old));
}
@@ -467,7 +471,8 @@ int fdt_open_into(const void *fdt, void
return -FDT_ERR_NOSPACE;
}

- fdt_packblocks_(fdt, tmp, mem_rsv_size, struct_size);
+ fdt_packblocks_(fdt, tmp, mem_rsv_size, struct_size,
+ fdt_size_dt_strings(fdt));
memmove(buf, tmp, newsize);

fdt_set_magic(buf, FDT_MAGIC);
@@ -487,7 +492,8 @@ int fdt_pack(void *fdt)

mem_rsv_size = (fdt_num_mem_rsv(fdt)+1)
* sizeof(struct fdt_reserve_entry);
- fdt_packblocks_(fdt, fdt, mem_rsv_size, fdt_size_dt_struct(fdt));
+ fdt_packblocks_(fdt, fdt, mem_rsv_size, fdt_size_dt_struct(fdt),
+ fdt_size_dt_strings(fdt));
fdt_set_totalsize(fdt, fdt_data_size_(fdt));

return 0;
--- a/scripts/dtc/libfdt/fdt_strerror.c
+++ b/scripts/dtc/libfdt/fdt_strerror.c
@@ -39,6 +39,7 @@ static struct fdt_errtabent fdt_errtable
FDT_ERRTABENT(FDT_ERR_BADOVERLAY),
FDT_ERRTABENT(FDT_ERR_NOPHANDLES),
FDT_ERRTABENT(FDT_ERR_BADFLAGS),
+ FDT_ERRTABENT(FDT_ERR_ALIGNMENT),
};
#define FDT_ERRTABSIZE ((int)(sizeof(fdt_errtable) / sizeof(fdt_errtable[0])))

--- a/scripts/dtc/libfdt/libfdt.h
+++ b/scripts/dtc/libfdt/libfdt.h
@@ -131,6 +131,13 @@ uint32_t fdt_next_tag(const void *fdt, i
* to work even with unaligned pointers on platforms (such as ARMv5) that don't
* like unaligned loads and stores.
*/
+static inline uint16_t fdt16_ld(const fdt16_t *p)
+{
+ const uint8_t *bp = (const uint8_t *)p;
+
+ return ((uint16_t)bp[0] << 8) | bp[1];
+}
+
static inline uint32_t fdt32_ld(const fdt32_t *p)
{
const uint8_t *bp = (const uint8_t *)p;
--- a/scripts/dtc/livetree.c
+++ b/scripts/dtc/livetree.c
@@ -526,7 +526,7 @@ struct node *get_node_by_path(struct nod
p = strchr(path, '/');

for_each_child(tree, child) {
- if (p && strprefixeq(path, p - path, child->name))
+ if (p && strprefixeq(path, (size_t)(p - path), child->name))
return get_node_by_path(child, p+1);
else if (!p && streq(path, child->name))
return child;
@@ -559,7 +559,7 @@ struct node *get_node_by_phandle(struct
{
struct node *child, *node;

- if ((phandle == 0) || (phandle == -1)) {
+ if (!phandle_is_valid(phandle)) {
assert(generate_fixups);
return NULL;
}
@@ -594,7 +594,7 @@ cell_t get_node_phandle(struct node *roo
static cell_t phandle = 1; /* FIXME: ick, static local */
struct data d = empty_data;

- if ((node->phandle != 0) && (node->phandle != -1))
+ if (phandle_is_valid(node->phandle))
return node->phandle;

while (get_node_by_phandle(root, phandle))
--- a/scripts/dtc/treesource.c
+++ b/scripts/dtc/treesource.c
@@ -124,27 +124,6 @@ static void write_propval_int(FILE *f, c
}
}

-static bool has_data_type_information(struct marker *m)
-{
- return m->type >= TYPE_UINT8;
-}
-
-static struct marker *next_type_marker(struct marker *m)
-{
- while (m && !has_data_type_information(m))
- m = m->next;
- return m;
-}
-
-size_t type_marker_length(struct marker *m)
-{
- struct marker *next = next_type_marker(m->next);
-
- if (next)
- return next->offset - m->offset;
- return 0;
-}
-
static const char *delim_start[] = {
[TYPE_UINT8] = "[",
[TYPE_UINT16] = "/bits/ 16 <",
@@ -229,26 +208,39 @@ static void write_propval(FILE *f, struc
size_t chunk_len = (m->next ? m->next->offset : len) - m->offset;
|
||||
size_t data_len = type_marker_length(m) ? : len - m->offset;
|
||||
const char *p = &prop->val.val[m->offset];
|
||||
+ struct marker *m_phandle;
|
||||
|
||||
- if (has_data_type_information(m)) {
|
||||
+ if (is_type_marker(m->type)) {
|
||||
emit_type = m->type;
|
||||
fprintf(f, " %s", delim_start[emit_type]);
|
||||
} else if (m->type == LABEL)
|
||||
fprintf(f, " %s:", m->ref);
|
||||
- else if (m->offset)
|
||||
- fputc(' ', f);
|
||||
|
||||
- if (emit_type == TYPE_NONE) {
|
||||
- assert(chunk_len == 0);
|
||||
+ if (emit_type == TYPE_NONE || chunk_len == 0)
|
||||
continue;
|
||||
- }
|
||||
|
||||
switch(emit_type) {
|
||||
case TYPE_UINT16:
|
||||
write_propval_int(f, p, chunk_len, 2);
|
||||
break;
|
||||
case TYPE_UINT32:
|
||||
- write_propval_int(f, p, chunk_len, 4);
|
||||
+ m_phandle = prop->val.markers;
|
||||
+ for_each_marker_of_type(m_phandle, REF_PHANDLE)
|
||||
+ if (m->offset == m_phandle->offset)
|
||||
+ break;
|
||||
+
|
||||
+ if (m_phandle) {
|
||||
+ if (m_phandle->ref[0] == '/')
|
||||
+ fprintf(f, "&{%s}", m_phandle->ref);
|
||||
+ else
|
||||
+ fprintf(f, "&%s", m_phandle->ref);
|
||||
+ if (chunk_len > 4) {
|
||||
+ fputc(' ', f);
|
||||
+ write_propval_int(f, p + 4, chunk_len - 4, 4);
|
||||
+ }
|
||||
+ } else {
|
||||
+ write_propval_int(f, p, chunk_len, 4);
|
||||
+ }
|
||||
break;
|
||||
case TYPE_UINT64:
|
||||
write_propval_int(f, p, chunk_len, 8);
|
||||
--- a/scripts/dtc/util.h
|
||||
+++ b/scripts/dtc/util.h
|
||||
@@ -13,10 +13,10 @@
|
||||
*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
-#ifdef __clang__
|
||||
-#define PRINTF(i, j) __attribute__((format (printf, i, j)))
|
||||
-#else
|
||||
+#if __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
|
||||
#define PRINTF(i, j) __attribute__((format (gnu_printf, i, j)))
|
||||
+#else
|
||||
+#define PRINTF(i, j) __attribute__((format (printf, i, j)))
|
||||
#endif
|
||||
#define NORETURN __attribute__((noreturn))
|
||||
#else
|
||||
--- a/scripts/dtc/version_gen.h
|
||||
+++ b/scripts/dtc/version_gen.h
|
||||
@@ -1 +1 @@
|
||||
-#define DTC_VERSION "DTC 1.6.0-g183df9e9"
|
||||
+#define DTC_VERSION "DTC 1.6.1-g0a3a9d34"
|
||||
--- a/scripts/dtc/yamltree.c
|
||||
+++ b/scripts/dtc/yamltree.c
|
||||
@@ -29,11 +29,12 @@ char *yaml_error_name[] = {
|
||||
(emitter)->problem, __func__, __LINE__); \
|
||||
})
|
||||
|
||||
-static void yaml_propval_int(yaml_emitter_t *emitter, struct marker *markers, char *data, unsigned int len, int width)
|
||||
+static void yaml_propval_int(yaml_emitter_t *emitter, struct marker *markers,
|
||||
+ char *data, unsigned int seq_offset, unsigned int len, int width)
|
||||
{
|
||||
yaml_event_t event;
|
||||
void *tag;
|
||||
- unsigned int off, start_offset = markers->offset;
|
||||
+ unsigned int off;
|
||||
|
||||
switch(width) {
|
||||
case 1: tag = "!u8"; break;
|
||||
@@ -66,7 +67,7 @@ static void yaml_propval_int(yaml_emitte
|
||||
m = markers;
|
||||
is_phandle = false;
|
||||
for_each_marker_of_type(m, REF_PHANDLE) {
|
||||
- if (m->offset == (start_offset + off)) {
|
||||
+ if (m->offset == (seq_offset + off)) {
|
||||
is_phandle = true;
|
||||
break;
|
||||
}
|
||||
@@ -114,6 +115,7 @@ static void yaml_propval(yaml_emitter_t
|
||||
yaml_event_t event;
|
||||
unsigned int len = prop->val.len;
|
||||
struct marker *m = prop->val.markers;
|
||||
+ struct marker *markers = prop->val.markers;
|
||||
|
||||
/* Emit the property name */
|
||||
yaml_scalar_event_initialize(&event, NULL,
|
||||
@@ -151,19 +153,19 @@ static void yaml_propval(yaml_emitter_t
|
||||
|
||||
switch(m->type) {
|
||||
case TYPE_UINT16:
|
||||
- yaml_propval_int(emitter, m, data, chunk_len, 2);
|
||||
+ yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 2);
|
||||
break;
|
||||
case TYPE_UINT32:
|
||||
- yaml_propval_int(emitter, m, data, chunk_len, 4);
|
||||
+ yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 4);
|
||||
break;
|
||||
case TYPE_UINT64:
|
||||
- yaml_propval_int(emitter, m, data, chunk_len, 8);
|
||||
+ yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 8);
|
||||
break;
|
||||
case TYPE_STRING:
|
||||
yaml_propval_string(emitter, data, chunk_len);
|
||||
break;
|
||||
default:
|
||||
- yaml_propval_int(emitter, m, data, chunk_len, 1);
|
||||
+ yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 1);
|
||||
break;
|
||||
}
|
||||
}
|
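The new fdt16_ld() above mirrors the existing fdt32_ld(): the big-endian value is assembled from single-byte loads, so the access stays legal even at odd addresses. A standalone sketch of the same technique (illustrative only, not part of the patch):

    #include <stdint.h>

    /* Byte-wise big-endian load: two 8-bit reads, no unaligned 16-bit access. */
    static inline uint16_t be16_from_bytes(const uint8_t *bp)
    {
        return (uint16_t)(bp[0] << 8) | bp[1];
    }

    /* Usage: be16_from_bytes(&blob[3]) is safe even though offset 3 is odd. */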
@ -0,0 +1,48 @@
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
To: linus.walleij@linaro.org
Cc: bjorn.andersson@linaro.org, dianders@chromium.org,
    linux-arm-msm@vger.kernel.org, linux-gpio@vger.kernel.org,
    linux-kernel@vger.kernel.org,
    Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Subject: [PATCH] pinctrl: qcom: Return -EINVAL for setting affinity if no IRQ
 parent
Date: Thu, 13 Jan 2022 21:56:17 +0530
Message-Id: <20220113162617.131697-1-manivannan.sadhasivam@linaro.org>

The MSM GPIO IRQ controller relies on the parent IRQ controller to set the
CPU affinity for the IRQ. And this is only valid if there is any wakeup
parent available and defined in DT.

For the case of no parent IRQ controller defined in DT,
msm_gpio_irq_set_affinity() and msm_gpio_irq_set_vcpu_affinity() should
return -EINVAL instead of 0 as the affinity can't be set.

Otherwise, the warning below will be printed by genirq:

genirq: irq_chip msmgpio did not update eff. affinity mask of irq 70

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
 drivers/pinctrl/qcom/pinctrl-msm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/pinctrl/qcom/pinctrl-msm.c
+++ b/drivers/pinctrl/qcom/pinctrl-msm.c
@@ -1157,7 +1157,7 @@ static int msm_gpio_irq_set_affinity(str
     if (d->parent_data && test_bit(d->hwirq, pctrl->skip_wake_irqs))
         return irq_chip_set_affinity_parent(d, dest, force);

-    return 0;
+    return -EINVAL;
 }

 static int msm_gpio_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
@@ -1168,7 +1168,7 @@ static int msm_gpio_irq_set_vcpu_affinit
     if (d->parent_data && test_bit(d->hwirq, pctrl->skip_wake_irqs))
         return irq_chip_set_vcpu_affinity_parent(d, vcpu_info);

-    return 0;
+    return -EINVAL;
 }

 static void msm_gpio_irq_handler(struct irq_desc *desc)
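The contract enforced by the patch fits in one illustrative callback. A minimal sketch, assuming a hypothetical driver; only irq_chip_set_affinity_parent() and the genirq types are the real API:

    #include <linux/irq.h>

    static int demo_irq_set_affinity(struct irq_data *d,
                                     const struct cpumask *dest, bool force)
    {
        /* Delegate when a parent irqchip actually backs this interrupt... */
        if (d->parent_data)
            return irq_chip_set_affinity_parent(d, dest, force);

        /* ...otherwise affinity cannot be set, so report that honestly. */
        return -EINVAL;
    }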
@ -0,0 +1,166 @@
From b5af64fceb04dc298c5e69c517b4d83893ff060b Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Thu, 30 Sep 2021 11:21:10 -0700
Subject: [PATCH 1/1] soc: qcom: smem: Support reserved-memory description

Practically all modern Qualcomm platforms have a single reserved-memory
region for SMEM. So rather than having to describe SMEM in the form of a
node with a reference to a reserved-memory node, allow the SMEM device
to be instantiated directly from the reserved-memory node.

The current means of dereferencing the "memory-region" is kept as a
fallback, for when it's determined that the SMEM node isn't a
reserved-memory node itself.

The "qcom,smem" compatible is added to the reserved_mem_matches list, to
allow the reserved-memory device to be probed.

In order to retain the readability of the code, the resolution of
resources is split from the actual ioremapping.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org>
Link: https://lore.kernel.org/r/20210930182111.57353-4-bjorn.andersson@linaro.org
---
 drivers/of/platform.c   |  1 +
 drivers/soc/qcom/smem.c | 57 ++++++++++++++++++++++++++++-------------
 2 files changed, 40 insertions(+), 18 deletions(-)

--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -509,6 +509,7 @@ EXPORT_SYMBOL_GPL(of_platform_default_po
 static const struct of_device_id reserved_mem_matches[] = {
     { .compatible = "qcom,rmtfs-mem" },
     { .compatible = "qcom,cmd-db" },
+    { .compatible = "qcom,smem" },
     { .compatible = "ramoops" },
     { .compatible = "nvmem-rmem" },
     {}
--- a/drivers/soc/qcom/smem.c
+++ b/drivers/soc/qcom/smem.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/of_reserved_mem.h>
 #include <linux/platform_device.h>
 #include <linux/sizes.h>
 #include <linux/slab.h>
@@ -240,7 +241,7 @@ static const u8 SMEM_INFO_MAGIC[] = { 0x
 * @size: size of the memory region
 */
 struct smem_region {
-    u32 aux_base;
+    phys_addr_t aux_base;
     void __iomem *virt_base;
     size_t size;
 };
@@ -499,7 +500,7 @@ static void *qcom_smem_get_global(struct
     for (i = 0; i < smem->num_regions; i++) {
         region = &smem->regions[i];

-        if (region->aux_base == aux_base || !aux_base) {
+        if ((u32)region->aux_base == aux_base || !aux_base) {
             if (size != NULL)
                 *size = le32_to_cpu(entry->size);
             return region->virt_base + le32_to_cpu(entry->offset);
@@ -664,7 +665,7 @@ phys_addr_t qcom_smem_virt_to_phys(void
         if (p < region->virt_base + region->size) {
             u64 offset = p - region->virt_base;

-            return (phys_addr_t)region->aux_base + offset;
+            return region->aux_base + offset;
         }
     }

@@ -863,12 +864,12 @@ qcom_smem_enumerate_partitions(struct qc
     return 0;
 }

-static int qcom_smem_map_memory(struct qcom_smem *smem, struct device *dev,
-                const char *name, int i)
+static int qcom_smem_resolve_mem(struct qcom_smem *smem, const char *name,
+                 struct smem_region *region)
 {
+    struct device *dev = smem->dev;
     struct device_node *np;
     struct resource r;
-    resource_size_t size;
     int ret;

     np = of_parse_phandle(dev->of_node, name, 0);
@@ -881,13 +882,9 @@ static int qcom_smem_map_memory(struct q
     of_node_put(np);
     if (ret)
         return ret;
-    size = resource_size(&r);

-    smem->regions[i].virt_base = devm_ioremap_wc(dev, r.start, size);
-    if (!smem->regions[i].virt_base)
-        return -ENOMEM;
-    smem->regions[i].aux_base = (u32)r.start;
-    smem->regions[i].size = size;
+    region->aux_base = r.start;
+    region->size = resource_size(&r);

     return 0;
 }
@@ -895,12 +892,14 @@ static int qcom_smem_map_memory(struct q
 static int qcom_smem_probe(struct platform_device *pdev)
 {
     struct smem_header *header;
+    struct reserved_mem *rmem;
     struct qcom_smem *smem;
     size_t array_size;
     int num_regions;
     int hwlock_id;
     u32 version;
     int ret;
+    int i;

     num_regions = 1;
     if (of_find_property(pdev->dev.of_node, "qcom,rpm-msg-ram", NULL))
@@ -914,13 +913,35 @@ static int qcom_smem_probe(struct platfo
     smem->dev = &pdev->dev;
     smem->num_regions = num_regions;

-    ret = qcom_smem_map_memory(smem, &pdev->dev, "memory-region", 0);
-    if (ret)
-        return ret;
-
-    if (num_regions > 1 && (ret = qcom_smem_map_memory(smem, &pdev->dev,
-                    "qcom,rpm-msg-ram", 1)))
-        return ret;
+    rmem = of_reserved_mem_lookup(pdev->dev.of_node);
+    if (rmem) {
+        smem->regions[0].aux_base = rmem->base;
+        smem->regions[0].size = rmem->size;
+    } else {
+        /*
+         * Fall back to the memory-region reference, if we're not a
+         * reserved-memory node.
+         */
+        ret = qcom_smem_resolve_mem(smem, "memory-region", &smem->regions[0]);
+        if (ret)
+            return ret;
+    }
+
+    if (num_regions > 1) {
+        ret = qcom_smem_resolve_mem(smem, "qcom,rpm-msg-ram", &smem->regions[1]);
+        if (ret)
+            return ret;
+    }
+
+    for (i = 0; i < num_regions; i++) {
+        smem->regions[i].virt_base = devm_ioremap_wc(&pdev->dev,
+                                 smem->regions[i].aux_base,
+                                 smem->regions[i].size);
+        if (!smem->regions[i].virt_base) {
+            dev_err(&pdev->dev, "failed to remap %pa\n", &smem->regions[i].aux_base);
+            return -ENOMEM;
+        }
+    }

     header = smem->regions[0].virt_base;
     if (le32_to_cpu(header->initialized) != 1 ||
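The new probe path is built on of_reserved_mem_lookup(), which resolves a device node back to its reserved-memory entry. A minimal usage sketch (hypothetical helper; only the lookup call and struct reserved_mem are the real API):

    #include <linux/of_reserved_mem.h>
    #include <linux/platform_device.h>

    static int demo_get_region(struct platform_device *pdev,
                               phys_addr_t *base, phys_addr_t *size)
    {
        /* Succeeds when the probed node itself is a reserved-memory node. */
        struct reserved_mem *rmem = of_reserved_mem_lookup(pdev->dev.of_node);

        if (!rmem)
            return -ENODEV; /* caller falls back to "memory-region" */
        *base = rmem->base;
        *size = rmem->size;
        return 0;
    }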
@ -0,0 +1,33 @@
From ee1a0696934a8b77a6a2098f92832c46d34ec5da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Wed, 27 Oct 2021 14:31:35 +0200
Subject: [PATCH] watchdog: bcm63xx_wdt: fix fallthrough warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes:
drivers/watchdog/bcm63xx_wdt.c: In function 'bcm63xx_wdt_ioctl':
drivers/watchdog/bcm63xx_wdt.c:208:17: warning: this statement may fall through [-Wimplicit-fallthrough=]

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20211027123135.27458-1-zajec5@gmail.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
 drivers/watchdog/bcm63xx_wdt.c | 2 ++
 1 file changed, 2 insertions(+)

--- a/drivers/watchdog/bcm63xx_wdt.c
+++ b/drivers/watchdog/bcm63xx_wdt.c
@@ -207,6 +207,8 @@ static long bcm63xx_wdt_ioctl(struct fil

         bcm63xx_wdt_pet();

+        fallthrough;
+
     case WDIOC_GETTIMEOUT:
         return put_user(wdt_time, p);
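For reference, fallthrough; is the kernel's portable spelling of the compiler's fall-through attribute (defined in include/linux/compiler_attributes.h). A minimal sketch of the pattern — an illustrative ioctl handler, not the driver's exact code:

    #include <linux/uaccess.h>
    #include <linux/watchdog.h>

    static long demo_ioctl(unsigned int cmd, int __user *p, int timeout)
    {
        switch (cmd) {
        case WDIOC_SETTIMEOUT:
            /* ...update the timeout and pet the watchdog here... */
            fallthrough;    /* deliberately report the (new) timeout */
        case WDIOC_GETTIMEOUT:
            return put_user(timeout, p);
        default:
            return -ENOTTY;
        }
    }

The annotation tells the compiler the fall-through is intentional, which is what silences -Wimplicit-fallthrough.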
@ -0,0 +1,162 @@
From 626bfa03729959ea9917181fb3d8ffaa1594d02a Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Wed, 13 Oct 2021 22:40:18 -0700
Subject: [PATCH 1/1] MIPS: kernel: proc: add CPU option reporting

Many MIPS CPUs have optional CPU features which are not activated for
all CPU cores. Print the CPU options, which are implemented in the core,
in /proc/cpuinfo. This makes it possible to see which features are
supported and which are not supported. This should cover all standard
MIPS extensions. Before, it only printed information about the main MIPS
ASEs.

Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>

Changes from original patch[0]:
- Remove cpu_has_6k_cache and cpu_has_8k_cache due to commit 6ce91ba8589a
  ("MIPS: Remove cpu_has_6k_cache and cpu_has_8k_cache in cpu_cache_init()")
- Add new options: mac2008_only, ftlbparex, gsexcex, mmid, mm_sysad,
  mm_full
- Use seq_puts instead of seq_printf as suggested by checkpatch
- Minor commit message reword

[0]: https://lore.kernel.org/linux-mips/20181223225224.23042-1-hauke@hauke-m.de/

Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Acked-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/kernel/proc.c | 122 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

--- a/arch/mips/kernel/proc.c
+++ b/arch/mips/kernel/proc.c
@@ -138,6 +138,128 @@ static int show_cpuinfo(struct seq_file
         seq_printf(m, "micromips kernel\t: %s\n",
               (read_c0_config3() & MIPS_CONF3_ISA_OE) ?  "yes" : "no");
     }
+
+    seq_puts(m, "Options implemented\t:");
+    if (cpu_has_tlb)
+        seq_puts(m, " tlb");
+    if (cpu_has_ftlb)
+        seq_puts(m, " ftlb");
+    if (cpu_has_tlbinv)
+        seq_puts(m, " tlbinv");
+    if (cpu_has_segments)
+        seq_puts(m, " segments");
+    if (cpu_has_rixiex)
+        seq_puts(m, " rixiex");
+    if (cpu_has_ldpte)
+        seq_puts(m, " ldpte");
+    if (cpu_has_maar)
+        seq_puts(m, " maar");
+    if (cpu_has_rw_llb)
+        seq_puts(m, " rw_llb");
+    if (cpu_has_4kex)
+        seq_puts(m, " 4kex");
+    if (cpu_has_3k_cache)
+        seq_puts(m, " 3k_cache");
+    if (cpu_has_4k_cache)
+        seq_puts(m, " 4k_cache");
+    if (cpu_has_tx39_cache)
+        seq_puts(m, " tx39_cache");
+    if (cpu_has_octeon_cache)
+        seq_puts(m, " octeon_cache");
+    if (cpu_has_fpu)
+        seq_puts(m, " fpu");
+    if (cpu_has_32fpr)
+        seq_puts(m, " 32fpr");
+    if (cpu_has_cache_cdex_p)
+        seq_puts(m, " cache_cdex_p");
+    if (cpu_has_cache_cdex_s)
+        seq_puts(m, " cache_cdex_s");
+    if (cpu_has_prefetch)
+        seq_puts(m, " prefetch");
+    if (cpu_has_mcheck)
+        seq_puts(m, " mcheck");
+    if (cpu_has_ejtag)
+        seq_puts(m, " ejtag");
+    if (cpu_has_llsc)
+        seq_puts(m, " llsc");
+    if (cpu_has_guestctl0ext)
+        seq_puts(m, " guestctl0ext");
+    if (cpu_has_guestctl1)
+        seq_puts(m, " guestctl1");
+    if (cpu_has_guestctl2)
+        seq_puts(m, " guestctl2");
+    if (cpu_has_guestid)
+        seq_puts(m, " guestid");
+    if (cpu_has_drg)
+        seq_puts(m, " drg");
+    if (cpu_has_rixi)
+        seq_puts(m, " rixi");
+    if (cpu_has_lpa)
+        seq_puts(m, " lpa");
+    if (cpu_has_mvh)
+        seq_puts(m, " mvh");
+    if (cpu_has_vtag_icache)
+        seq_puts(m, " vtag_icache");
+    if (cpu_has_dc_aliases)
+        seq_puts(m, " dc_aliases");
+    if (cpu_has_ic_fills_f_dc)
+        seq_puts(m, " ic_fills_f_dc");
+    if (cpu_has_pindexed_dcache)
+        seq_puts(m, " pindexed_dcache");
+    if (cpu_has_userlocal)
+        seq_puts(m, " userlocal");
+    if (cpu_has_nofpuex)
+        seq_puts(m, " nofpuex");
+    if (cpu_has_vint)
+        seq_puts(m, " vint");
+    if (cpu_has_veic)
+        seq_puts(m, " veic");
+    if (cpu_has_inclusive_pcaches)
+        seq_puts(m, " inclusive_pcaches");
+    if (cpu_has_perf_cntr_intr_bit)
+        seq_puts(m, " perf_cntr_intr_bit");
+    if (cpu_has_ufr)
+        seq_puts(m, " ufr");
+    if (cpu_has_fre)
+        seq_puts(m, " fre");
+    if (cpu_has_cdmm)
+        seq_puts(m, " cdmm");
+    if (cpu_has_small_pages)
+        seq_puts(m, " small_pages");
+    if (cpu_has_nan_legacy)
+        seq_puts(m, " nan_legacy");
+    if (cpu_has_nan_2008)
+        seq_puts(m, " nan_2008");
+    if (cpu_has_ebase_wg)
+        seq_puts(m, " ebase_wg");
+    if (cpu_has_badinstr)
+        seq_puts(m, " badinstr");
+    if (cpu_has_badinstrp)
+        seq_puts(m, " badinstrp");
+    if (cpu_has_contextconfig)
+        seq_puts(m, " contextconfig");
+    if (cpu_has_perf)
+        seq_puts(m, " perf");
+    if (cpu_has_mac2008_only)
+        seq_puts(m, " mac2008_only");
+    if (cpu_has_ftlbparex)
+        seq_puts(m, " ftlbparex");
+    if (cpu_has_gsexcex)
+        seq_puts(m, " gsexcex");
+    if (cpu_has_shared_ftlb_ram)
+        seq_puts(m, " shared_ftlb_ram");
+    if (cpu_has_shared_ftlb_entries)
+        seq_puts(m, " shared_ftlb_entries");
+    if (cpu_has_mipsmt_pertccounters)
+        seq_puts(m, " mipsmt_pertccounters");
+    if (cpu_has_mmid)
+        seq_puts(m, " mmid");
+    if (cpu_has_mm_sysad)
+        seq_puts(m, " mm_sysad");
+    if (cpu_has_mm_full)
+        seq_puts(m, " mm_full");
+    seq_puts(m, "\n");
+
     seq_printf(m, "shadow register sets\t: %d\n",
           cpu_data[n].srsets);
     seq_printf(m, "kscratch registers\t: %d\n",
@ -0,0 +1,62 @@
From 1cab5bd69eb1f995ced2d7576cb15f8a8941fd85 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Thu, 25 Nov 2021 19:39:32 +0800
Subject: [PATCH 1/1] MIPS: Fix using smp_processor_id() in preemptible in
 show_cpuinfo()

There exists the following issue under DEBUG_PREEMPT:

BUG: using smp_processor_id() in preemptible [00000000] code: systemd/1
caller is show_cpuinfo+0x460/0xea0
...
Call Trace:
[<ffffffff8020f0dc>] show_stack+0x94/0x128
[<ffffffff80e6cab4>] dump_stack_lvl+0x94/0xd8
[<ffffffff80e74c5c>] check_preemption_disabled+0x104/0x110
[<ffffffff802209c8>] show_cpuinfo+0x460/0xea0
[<ffffffff80539d54>] seq_read_iter+0xfc/0x4f8
[<ffffffff804fcc10>] new_sync_read+0x110/0x1b8
[<ffffffff804ff57c>] vfs_read+0x1b4/0x1d0
[<ffffffff804ffb18>] ksys_read+0xd0/0x110
[<ffffffff8021c090>] syscall_common+0x34/0x58

We can see the following call trace:
show_cpuinfo()
  cpu_has_fpu
    current_cpu_data
      smp_processor_id()

$ addr2line -f -e vmlinux 0xffffffff802209c8
show_cpuinfo
arch/mips/kernel/proc.c:188

$ head -188 arch/mips/kernel/proc.c | tail -1
    if (cpu_has_fpu)

arch/mips/include/asm/cpu-features.h
# define cpu_has_fpu (current_cpu_data.options & MIPS_CPU_FPU)

arch/mips/include/asm/cpu-info.h
#define current_cpu_data cpu_data[smp_processor_id()]

Based on the above analysis, fix the issue by using raw_cpu_has_fpu
which calls raw_smp_processor_id() in show_cpuinfo().

Fixes: 626bfa037299 ("MIPS: kernel: proc: add CPU option reporting")
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
 arch/mips/kernel/proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/mips/kernel/proc.c
+++ b/arch/mips/kernel/proc.c
@@ -166,7 +166,7 @@ static int show_cpuinfo(struct seq_file
         seq_puts(m, " tx39_cache");
     if (cpu_has_octeon_cache)
         seq_puts(m, " octeon_cache");
-    if (cpu_has_fpu)
+    if (raw_cpu_has_fpu)
         seq_puts(m, " fpu");
     if (cpu_has_32fpr)
         seq_puts(m, " 32fpr");
@ -0,0 +1,186 @@
From f4c5c7f9d2e5ab005d57826b740b694b042a737c Mon Sep 17 00:00:00 2001
From: Felix Matouschek <felix@matouschek.org>
Date: Mon, 18 Apr 2022 15:28:03 +0200
Subject: [PATCH 1/1] mtd: spinand: Add support for XTX XT26G0xA

Add support for XTX Technology XT26G01AXXXXX, XT26G02AXXXXX and
XT26G04AXXXXX SPI NAND.

These are 3V, 1G/2G/4Gbit serial SLC NAND flash devices with on-die ECC
(8bit strength per 512bytes).

Tested on Teltonika RUTX10 flashed with OpenWrt.

Links:
- http://www.xtxtech.com/download/?AId=225
- https://datasheet.lcsc.com/szlcsc/2005251034_XTX-XT26G01AWSEGA_C558841.pdf

Signed-off-by: Felix Matouschek <felix@matouschek.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220418132803.664103-1-felix@matouschek.org
---
 drivers/mtd/nand/spi/Makefile |   2 +-
 drivers/mtd/nand/spi/core.c   |   1 +
 drivers/mtd/nand/spi/xtx.c    | 129 ++++++++++++++++++++++++++++++++++
 include/linux/mtd/spinand.h   |   1 +
 4 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 drivers/mtd/nand/spi/xtx.c

--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o
+spinand-objs := core.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o xtx.o
 obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -902,6 +902,7 @@ static const struct spinand_manufacturer
     &paragon_spinand_manufacturer,
     &toshiba_spinand_manufacturer,
     &winbond_spinand_manufacturer,
+    &xtx_spinand_manufacturer,
 };

 static int spinand_manufacturer_match(struct spinand_device *spinand,
--- /dev/null
+++ b/drivers/mtd/nand/spi/xtx.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author:
+ * Felix Matouschek <felix@matouschek.org>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+#define SPINAND_MFR_XTX 0x0B
+
+#define XT26G0XA_STATUS_ECC_MASK        GENMASK(5, 2)
+#define XT26G0XA_STATUS_ECC_NO_DETECTED (0 << 2)
+#define XT26G0XA_STATUS_ECC_8_CORRECTED (3 << 4)
+#define XT26G0XA_STATUS_ECC_UNCOR_ERROR (2 << 4)
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+        SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(0, 1, NULL, 0),
+        SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+        SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(0, 1, NULL, 0),
+        SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+        SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+        SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+        SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+        SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+        SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+        SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+static int xt26g0xa_ooblayout_ecc(struct mtd_info *mtd, int section,
+                  struct mtd_oob_region *region)
+{
+    if (section)
+        return -ERANGE;
+
+    region->offset = 48;
+    region->length = 16;
+
+    return 0;
+}
+
+static int xt26g0xa_ooblayout_free(struct mtd_info *mtd, int section,
+                   struct mtd_oob_region *region)
+{
+    if (section)
+        return -ERANGE;
+
+    region->offset = 1;
+    region->length = 47;
+
+    return 0;
+}
+
+static const struct mtd_ooblayout_ops xt26g0xa_ooblayout = {
+    .ecc = xt26g0xa_ooblayout_ecc,
+    .free = xt26g0xa_ooblayout_free,
+};
+
+static int xt26g0xa_ecc_get_status(struct spinand_device *spinand,
+                   u8 status)
+{
+    status = status & XT26G0XA_STATUS_ECC_MASK;
+
+    switch (status) {
+    case XT26G0XA_STATUS_ECC_NO_DETECTED:
+        return 0;
+    case XT26G0XA_STATUS_ECC_8_CORRECTED:
+        return 8;
+    case XT26G0XA_STATUS_ECC_UNCOR_ERROR:
+        return -EBADMSG;
+    default:
+        break;
+    }
+
+    /* At this point values greater than (2 << 4) are invalid */
+    if (status > XT26G0XA_STATUS_ECC_UNCOR_ERROR)
+        return -EINVAL;
+
+    /* (1 << 2) through (7 << 2) are 1-7 corrected errors */
+    return status >> 2;
+}
+
+static const struct spinand_info xtx_spinand_table[] = {
+    SPINAND_INFO("XT26G01A",
+             SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xE1),
+             NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1),
+             NAND_ECCREQ(8, 512),
+             SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+                          &write_cache_variants,
+                          &update_cache_variants),
+             SPINAND_HAS_QE_BIT,
+             SPINAND_ECCINFO(&xt26g0xa_ooblayout,
+                     xt26g0xa_ecc_get_status)),
+    SPINAND_INFO("XT26G02A",
+             SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xE2),
+             NAND_MEMORG(1, 2048, 64, 64, 2048, 40, 1, 1, 1),
+             NAND_ECCREQ(8, 512),
+             SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+                          &write_cache_variants,
+                          &update_cache_variants),
+             SPINAND_HAS_QE_BIT,
+             SPINAND_ECCINFO(&xt26g0xa_ooblayout,
+                     xt26g0xa_ecc_get_status)),
+    SPINAND_INFO("XT26G04A",
+             SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xE3),
+             NAND_MEMORG(1, 2048, 64, 128, 2048, 40, 1, 1, 1),
+             NAND_ECCREQ(8, 512),
+             SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+                          &write_cache_variants,
+                          &update_cache_variants),
+             SPINAND_HAS_QE_BIT,
+             SPINAND_ECCINFO(&xt26g0xa_ooblayout,
+                     xt26g0xa_ecc_get_status)),
+};
+
+static const struct spinand_manufacturer_ops xtx_spinand_manuf_ops = {
+};
+
+const struct spinand_manufacturer xtx_spinand_manufacturer = {
+    .id = SPINAND_MFR_XTX,
+    .name = "XTX",
+    .chips = xtx_spinand_table,
+    .nchips = ARRAY_SIZE(xtx_spinand_table),
+    .ops = &xtx_spinand_manuf_ops,
+};
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -266,6 +266,7 @@ extern const struct spinand_manufacturer
 extern const struct spinand_manufacturer paragon_spinand_manufacturer;
 extern const struct spinand_manufacturer toshiba_spinand_manufacturer;
 extern const struct spinand_manufacturer winbond_spinand_manufacturer;
+extern const struct spinand_manufacturer xtx_spinand_manufacturer;

 /**
 * struct spinand_op_variants - SPI NAND operation variants
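The status decoding in xt26g0xa_ecc_get_status() is dense, so a worked sketch may help — a hypothetical standalone helper; the constants and value mapping are the driver's own:

    /* Decode bits 5:2 of the raw status register:
     *   0x00          -> 0 bitflips
     *   (1..7) << 2   -> that many corrected bitflips (status >> 2)
     *   0x30 (3 << 4) -> 8 corrected bitflips (special encoding)
     *   0x20 (2 << 4) -> uncorrectable page, reported as -EBADMSG
     *   anything else above 0x20 is invalid -> -EINVAL
     */
    static int demo_decode(u8 status)
    {
        status &= XT26G0XA_STATUS_ECC_MASK;

        if (status == XT26G0XA_STATUS_ECC_8_CORRECTED)
            return 8;
        if (status == XT26G0XA_STATUS_ECC_UNCOR_ERROR)
            return -EBADMSG;
        if (status > XT26G0XA_STATUS_ECC_UNCOR_ERROR)
            return -EINVAL;
        return status >> 2;
    }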
@ -0,0 +1,219 @@
From 4bf18d5a2dd02db8c5b16a2cfae513510506df5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Thu, 3 Feb 2022 22:44:40 +0100
Subject: [PATCH 1/2] phy: marvell: phy-mvebu-a3700-comphy: Remove port from
 driver configuration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The port number is encoded into the argument for the SMC call. It is zero
for SATA, PCIe and also both USB 3.0 PHYs. It is non-zero only for the
Ethernet PHY (incorrectly called SGMII) on lane 0. The Ethernet PHY on
lane 1 also uses a zero port number.

So the "port" bits of the SMC call argument can be constructed directly
from the PHY type and lane number.

Change the driver code to always pass a zero port number for non-ethernet
PHYs, and to determine the port number from the lane number for ethernet
PHYs. This simplifies the driver.

As the port number from the DT PHY configuration is not used anymore,
remove the driver code which parses it. This also simplifies the driver.

Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20220203214444.1508-2-kabel@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/phy/marvell/phy-mvebu-a3700-comphy.c | 62 +++++++++-----------
 1 file changed, 29 insertions(+), 33 deletions(-)

--- a/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
+++ b/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
@@ -20,7 +20,6 @@
 #include <linux/platform_device.h>

 #define MVEBU_A3700_COMPHY_LANES 3
-#define MVEBU_A3700_COMPHY_PORTS 2

 /* COMPHY Fast SMC function identifiers */
 #define COMPHY_SIP_POWER_ON 0x82000001
@@ -45,51 +44,47 @@
 #define COMPHY_FW_NET(mode, idx, speed) (COMPHY_FW_MODE(mode) | \
                      ((idx) << 8) | \
                      ((speed) << 2))
-#define COMPHY_FW_PCIE(mode, idx, speed, width) (COMPHY_FW_NET(mode, idx, speed) | \
+#define COMPHY_FW_PCIE(mode, speed, width) (COMPHY_FW_NET(mode, 0, speed) | \
                      ((width) << 18))

 struct mvebu_a3700_comphy_conf {
     unsigned int lane;
     enum phy_mode mode;
     int submode;
-    unsigned int port;
     u32 fw_mode;
 };

-#define MVEBU_A3700_COMPHY_CONF(_lane, _mode, _smode, _port, _fw) \
+#define MVEBU_A3700_COMPHY_CONF(_lane, _mode, _smode, _fw) \
     { \
         .lane = _lane, \
         .mode = _mode, \
         .submode = _smode, \
-        .port = _port, \
         .fw_mode = _fw, \
     }

-#define MVEBU_A3700_COMPHY_CONF_GEN(_lane, _mode, _port, _fw) \
-    MVEBU_A3700_COMPHY_CONF(_lane, _mode, PHY_INTERFACE_MODE_NA, _port, _fw)
+#define MVEBU_A3700_COMPHY_CONF_GEN(_lane, _mode, _fw) \
+    MVEBU_A3700_COMPHY_CONF(_lane, _mode, PHY_INTERFACE_MODE_NA, _fw)

-#define MVEBU_A3700_COMPHY_CONF_ETH(_lane, _smode, _port, _fw) \
-    MVEBU_A3700_COMPHY_CONF(_lane, PHY_MODE_ETHERNET, _smode, _port, _fw)
+#define MVEBU_A3700_COMPHY_CONF_ETH(_lane, _smode, _fw) \
+    MVEBU_A3700_COMPHY_CONF(_lane, PHY_MODE_ETHERNET, _smode, _fw)

 static const struct mvebu_a3700_comphy_conf mvebu_a3700_comphy_modes[] = {
     /* lane 0 */
-    MVEBU_A3700_COMPHY_CONF_GEN(0, PHY_MODE_USB_HOST_SS, 0,
+    MVEBU_A3700_COMPHY_CONF_GEN(0, PHY_MODE_USB_HOST_SS,
                     COMPHY_FW_MODE_USB3H),
-    MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_SGMII, 1,
+    MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_SGMII,
                     COMPHY_FW_MODE_SGMII),
-    MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_2500BASEX, 1,
+    MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_2500BASEX,
                     COMPHY_FW_MODE_2500BASEX),
     /* lane 1 */
-    MVEBU_A3700_COMPHY_CONF_GEN(1, PHY_MODE_PCIE, 0,
-                    COMPHY_FW_MODE_PCIE),
-    MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_SGMII, 0,
+    MVEBU_A3700_COMPHY_CONF_GEN(1, PHY_MODE_PCIE, COMPHY_FW_MODE_PCIE),
+    MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_SGMII,
                     COMPHY_FW_MODE_SGMII),
-    MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_2500BASEX, 0,
+    MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_2500BASEX,
                     COMPHY_FW_MODE_2500BASEX),
     /* lane 2 */
-    MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_SATA, 0,
-                    COMPHY_FW_MODE_SATA),
-    MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_USB_HOST_SS, 0,
+    MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_SATA, COMPHY_FW_MODE_SATA),
+    MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_USB_HOST_SS,
                     COMPHY_FW_MODE_USB3H),
 };

@@ -98,7 +93,6 @@ struct mvebu_a3700_comphy_lane {
     unsigned int id;
     enum phy_mode mode;
     int submode;
-    int port;
 };

 static int mvebu_a3700_comphy_smc(unsigned long function, unsigned long lane,
@@ -120,7 +114,7 @@ static int mvebu_a3700_comphy_smc(unsign
     }
 }

-static int mvebu_a3700_comphy_get_fw_mode(int lane, int port,
+static int mvebu_a3700_comphy_get_fw_mode(int lane,
                       enum phy_mode mode,
                       int submode)
 {
@@ -132,7 +126,6 @@ static int mvebu_a3700_comphy_get_fw_mod

     for (i = 0; i < n; i++) {
         if (mvebu_a3700_comphy_modes[i].lane == lane &&
-            mvebu_a3700_comphy_modes[i].port == port &&
             mvebu_a3700_comphy_modes[i].mode == mode &&
             mvebu_a3700_comphy_modes[i].submode == submode)
             break;
@@ -153,7 +146,7 @@ static int mvebu_a3700_comphy_set_mode(s
     if (submode == PHY_INTERFACE_MODE_1000BASEX)
         submode = PHY_INTERFACE_MODE_SGMII;

-    fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id, lane->port, mode,
+    fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id, mode,
                          submode);
     if (fw_mode < 0) {
         dev_err(lane->dev, "invalid COMPHY mode\n");
@@ -172,9 +165,10 @@ static int mvebu_a3700_comphy_power_on(s
     struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
     u32 fw_param;
     int fw_mode;
+    int fw_port;
     int ret;

-    fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id, lane->port,
+    fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id,
                          lane->mode, lane->submode);
     if (fw_mode < 0) {
         dev_err(lane->dev, "invalid COMPHY mode\n");
@@ -191,17 +185,18 @@ static int mvebu_a3700_comphy_power_on(s
         fw_param = COMPHY_FW_MODE(fw_mode);
         break;
     case PHY_MODE_ETHERNET:
+        fw_port = (lane->id == 0) ? 1 : 0;
         switch (lane->submode) {
         case PHY_INTERFACE_MODE_SGMII:
             dev_dbg(lane->dev, "set lane %d to SGMII mode\n",
                 lane->id);
-            fw_param = COMPHY_FW_NET(fw_mode, lane->port,
+            fw_param = COMPHY_FW_NET(fw_mode, fw_port,
                          COMPHY_FW_SPEED_1_25G);
             break;
         case PHY_INTERFACE_MODE_2500BASEX:
             dev_dbg(lane->dev, "set lane %d to 2500BASEX mode\n",
                 lane->id);
-            fw_param = COMPHY_FW_NET(fw_mode, lane->port,
+            fw_param = COMPHY_FW_NET(fw_mode, fw_port,
                          COMPHY_FW_SPEED_3_125G);
             break;
         default:
@@ -212,8 +207,7 @@ static int mvebu_a3700_comphy_power_on(s
         break;
     case PHY_MODE_PCIE:
         dev_dbg(lane->dev, "set lane %d to PCIe mode\n", lane->id);
-        fw_param = COMPHY_FW_PCIE(fw_mode, lane->port,
-                      COMPHY_FW_SPEED_5G,
+        fw_param = COMPHY_FW_PCIE(fw_mode, COMPHY_FW_SPEED_5G,
                       phy->attrs.bus_width);
         break;
     default:
@@ -247,17 +241,20 @@ static struct phy *mvebu_a3700_comphy_xl
                         struct of_phandle_args *args)
 {
     struct mvebu_a3700_comphy_lane *lane;
+    unsigned int port;
     struct phy *phy;

-    if (WARN_ON(args->args[0] >= MVEBU_A3700_COMPHY_PORTS))
-        return ERR_PTR(-EINVAL);
-
     phy = of_phy_simple_xlate(dev, args);
     if (IS_ERR(phy))
         return phy;

     lane = phy_get_drvdata(phy);
-    lane->port = args->args[0];
+
+    port = args->args[0];
+    if (port != 0 && (port != 1 || lane->id != 0)) {
+        dev_err(lane->dev, "invalid port number %u\n", port);
+        return ERR_PTR(-EINVAL);
+    }

     return phy;
 }
@@ -302,7 +299,6 @@ static int mvebu_a3700_comphy_probe(stru
         lane->mode = PHY_MODE_INVALID;
         lane->submode = PHY_INTERFACE_MODE_NA;
         lane->id = lane_id;
-        lane->port = -1;
         phy_set_drvdata(phy, lane);
     }
File diff suppressed because it is too large
@ -0,0 +1,32 @@
From 73a78b6130d9e13daca22b86ad52f063b9403e03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Wed, 8 Dec 2021 03:40:35 +0100
Subject: [PATCH 1/1] arm64: dts: marvell: armada-37xx: Add xtal clock to
 comphy node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Kernel driver phy-mvebu-a3700-comphy.c needs to know the rate of the
reference xtal clock. So add the missing xtal clock source into the
comphy device tree node. If the property is not present, the driver
defaults to a 25 MHz xtal rate (which, as far as we know, is used by all
the existing boards).

Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
---
 arch/arm64/boot/dts/marvell/armada-37xx.dtsi | 2 ++
 1 file changed, 2 insertions(+)

--- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
@@ -265,6 +265,8 @@
                     "lane2_sata_usb3";
             #address-cells = <1>;
             #size-cells = <0>;
+            clocks = <&xtalclk>;
+            clock-names = "xtal";

             comphy0: phy@0 {
                 reg = <0>;
@ -0,0 +1,64 @@
From ee995101fde67f85a3cd4c74f4f92fc4592e726b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Thu, 3 Feb 2022 22:44:42 +0100
Subject: [PATCH 1/3] Revert "ata: ahci: mvebu: Make SATA PHY optional for
 Armada 3720"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 45aefe3d2251e4e229d7662052739f96ad1d08d9.

Armada 3720 PHY driver (phy-mvebu-a3700-comphy.c) does not return
-EOPNOTSUPP from phy_power_on() callback anymore.

So remove AHCI_HFLAG_IGN_NOTSUPP_POWER_ON flag from Armada 3720 plat data.

AHCI_HFLAG_IGN_NOTSUPP_POWER_ON is not used by any other ahci driver, so
remove this flag completely.

Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Link: https://lore.kernel.org/r/20220203214444.1508-4-kabel@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/ata/ahci.h             | 2 --
 drivers/ata/ahci_mvebu.c       | 2 +-
 drivers/ata/libahci_platform.c | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

--- a/drivers/ata/ahci.h
+++ b/drivers/ata/ahci.h
@@ -240,8 +240,6 @@ enum {
                             as default lpm_policy */
     AHCI_HFLAG_SUSPEND_PHYS = (1 << 26), /* handle PHYs during
                             suspend/resume */
-    AHCI_HFLAG_IGN_NOTSUPP_POWER_ON = (1 << 27), /* ignore -EOPNOTSUPP
-                            from phy_power_on() */
     AHCI_HFLAG_NO_SXS = (1 << 28), /* SXS not supported */

     /* ap->flags bits */
--- a/drivers/ata/ahci_mvebu.c
+++ b/drivers/ata/ahci_mvebu.c
@@ -227,7 +227,7 @@ static const struct ahci_mvebu_plat_data

 static const struct ahci_mvebu_plat_data ahci_mvebu_armada_3700_plat_data = {
     .plat_config = ahci_mvebu_armada_3700_config,
-    .flags = AHCI_HFLAG_SUSPEND_PHYS | AHCI_HFLAG_IGN_NOTSUPP_POWER_ON,
+    .flags = AHCI_HFLAG_SUSPEND_PHYS,
 };

 static const struct of_device_id ahci_mvebu_of_match[] = {
--- a/drivers/ata/libahci_platform.c
+++ b/drivers/ata/libahci_platform.c
@@ -59,7 +59,7 @@ int ahci_platform_enable_phys(struct ahc
     }

     rc = phy_power_on(hpriv->phys[i]);
-    if (rc && !(rc == -EOPNOTSUPP && (hpriv->flags & AHCI_HFLAG_IGN_NOTSUPP_POWER_ON))) {
+    if (rc) {
         phy_exit(hpriv->phys[i]);
         goto disable_phys;
     }
@ -0,0 +1,166 @@
From 8e10548f7f4814e530857d2049d6af6bc78add53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Thu, 3 Feb 2022 22:44:43 +0100
Subject: [PATCH 2/3] Revert "usb: host: xhci: mvebu: make USB 3.0 PHY optional
 for Armada 3720"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 3241929b67d28c83945d3191c6816a3271fd6b85.

Armada 3720 phy driver (phy-mvebu-a3700-comphy.c) does not return
-EOPNOTSUPP from phy_power_on() callback anymore.

So remove XHCI_SKIP_PHY_INIT flag from xhci_mvebu_a3700_plat_setup() and
then also whole xhci_mvebu_a3700_plat_setup() function which is there just
to handle -EOPNOTSUPP for XHCI_SKIP_PHY_INIT.

xhci plat_setup callback is not used by any other xhci plat driver, so
remove this callback completely.

Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220203214444.1508-5-kabel@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/usb/host/xhci-mvebu.c | 42 -----------------------------------
 drivers/usb/host/xhci-mvebu.h |  6 -----
 drivers/usb/host/xhci-plat.c  | 20 +----------------
 drivers/usb/host/xhci-plat.h  |  1 -
 4 files changed, 1 insertion(+), 68 deletions(-)

--- a/drivers/usb/host/xhci-mvebu.c
+++ b/drivers/usb/host/xhci-mvebu.c
@@ -8,7 +8,6 @@
 #include <linux/mbus.h>
 #include <linux/of.h>
 #include <linux/platform_device.h>
-#include <linux/phy/phy.h>

 #include <linux/usb.h>
 #include <linux/usb/hcd.h>
@@ -74,47 +73,6 @@ int xhci_mvebu_mbus_init_quirk(struct us

     return 0;
 }
-
-int xhci_mvebu_a3700_plat_setup(struct usb_hcd *hcd)
-{
-    struct xhci_hcd *xhci = hcd_to_xhci(hcd);
-    struct device *dev = hcd->self.controller;
-    struct phy *phy;
-    int ret;
-
-    /* Old bindings miss the PHY handle */
-    phy = of_phy_get(dev->of_node, "usb3-phy");
-    if (IS_ERR(phy) && PTR_ERR(phy) == -EPROBE_DEFER)
-        return -EPROBE_DEFER;
-    else if (IS_ERR(phy))
-        goto phy_out;
-
-    ret = phy_init(phy);
-    if (ret)
-        goto phy_put;
-
-    ret = phy_set_mode(phy, PHY_MODE_USB_HOST_SS);
-    if (ret)
-        goto phy_exit;
-
-    ret = phy_power_on(phy);
-    if (ret == -EOPNOTSUPP) {
-        /* Skip initializatin of XHCI PHY when it is unsupported by firmware */
-        dev_warn(dev, "PHY unsupported by firmware\n");
-        xhci->quirks |= XHCI_SKIP_PHY_INIT;
-    }
-    if (ret)
-        goto phy_exit;
-
-    phy_power_off(phy);
-phy_exit:
-    phy_exit(phy);
-phy_put:
-    of_phy_put(phy);
-phy_out:
-
-    return 0;
-}

 int xhci_mvebu_a3700_init_quirk(struct usb_hcd *hcd)
 {
--- a/drivers/usb/host/xhci-mvebu.h
+++ b/drivers/usb/host/xhci-mvebu.h
@@ -12,18 +12,12 @@ struct usb_hcd;

 #if IS_ENABLED(CONFIG_USB_XHCI_MVEBU)
 int xhci_mvebu_mbus_init_quirk(struct usb_hcd *hcd);
-int xhci_mvebu_a3700_plat_setup(struct usb_hcd *hcd);
 int xhci_mvebu_a3700_init_quirk(struct usb_hcd *hcd);
 #else
 static inline int xhci_mvebu_mbus_init_quirk(struct usb_hcd *hcd)
 {
     return 0;
 }
-
-static inline int xhci_mvebu_a3700_plat_setup(struct usb_hcd *hcd)
-{
-    return 0;
-}

 static inline int xhci_mvebu_a3700_init_quirk(struct usb_hcd *hcd)
 {
--- a/drivers/usb/host/xhci-plat.c
+++ b/drivers/usb/host/xhci-plat.c
@@ -44,16 +44,6 @@ static void xhci_priv_plat_start(struct
         priv->plat_start(hcd);
 }

-static int xhci_priv_plat_setup(struct usb_hcd *hcd)
-{
-    struct xhci_plat_priv *priv = hcd_to_xhci_priv(hcd);
-
-    if (!priv->plat_setup)
-        return 0;
-
-    return priv->plat_setup(hcd);
-}
-
 static int xhci_priv_init_quirk(struct usb_hcd *hcd)
 {
     struct xhci_plat_priv *priv = hcd_to_xhci_priv(hcd);
@@ -121,7 +111,6 @@ static const struct xhci_plat_priv xhci_
 };

 static const struct xhci_plat_priv xhci_plat_marvell_armada3700 = {
-    .plat_setup = xhci_mvebu_a3700_plat_setup,
     .init_quirk = xhci_mvebu_a3700_init_quirk,
 };

@@ -341,14 +330,7 @@ static int xhci_plat_probe(struct platfo

     hcd->tpl_support = of_usb_host_tpl_support(sysdev->of_node);
     xhci->shared_hcd->tpl_support = hcd->tpl_support;
-
-    if (priv) {
-        ret = xhci_priv_plat_setup(hcd);
-        if (ret)
-            goto disable_usb_phy;
-    }
-
-    if ((xhci->quirks & XHCI_SKIP_PHY_INIT) || (priv && (priv->quirks & XHCI_SKIP_PHY_INIT)))
+    if (priv && (priv->quirks & XHCI_SKIP_PHY_INIT))
         hcd->skip_phy_initialization = 1;

     if (priv && (priv->quirks & XHCI_SG_TRB_CACHE_SIZE_QUIRK))
--- a/drivers/usb/host/xhci-plat.h
+++ b/drivers/usb/host/xhci-plat.h
@@ -13,7 +13,6 @@
 struct xhci_plat_priv {
     const char *firmware_name;
     unsigned long long quirks;
-    int (*plat_setup)(struct usb_hcd *);
     void (*plat_start)(struct usb_hcd *);
     int (*init_quirk)(struct usb_hcd *);
     int (*suspend_quirk)(struct usb_hcd *);
@ -0,0 +1,39 @@
From 9a4556dad7bd0a6b8339cb72e169f5c76f2af6f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Thu, 3 Feb 2022 22:44:44 +0100
Subject: [PATCH 3/3] Revert "PCI: aardvark: Fix initialization with old
 Marvell's Arm Trusted Firmware"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit b0c6ae0f8948a2be6bf4e8b4bbab9ca1343289b6.

Armada 3720 phy driver (phy-mvebu-a3700-comphy.c) does not return
-EOPNOTSUPP from phy_power_on() callback anymore.

So remove dead code which handles -EOPNOTSUPP return value.

Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Link: https://lore.kernel.org/r/20220203214444.1508-6-kabel@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/pci/controller/pci-aardvark.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

--- a/drivers/pci/controller/pci-aardvark.c
+++ b/drivers/pci/controller/pci-aardvark.c
@@ -1642,9 +1642,7 @@ static int advk_pcie_enable_phy(struct a
     }

     ret = phy_power_on(pcie->phy);
-    if (ret == -EOPNOTSUPP) {
-        dev_warn(&pcie->pdev->dev, "PHY unsupported by firmware\n");
-    } else if (ret) {
+    if (ret) {
         phy_exit(pcie->phy);
         return ret;
     }
@ -0,0 +1,194 @@
From 0a6fc70d76bddf98278af2ac000379c82aec8f11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Mon, 29 Aug 2022 10:30:46 +0200
Subject: [PATCH] phy: marvell: phy-mvebu-a3700-comphy: Remove broken reset
 support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reset support for the SATA PHY is somehow broken and after calling it,
the kernel is not able to detect and initialize the SATA disk Samsung SSD
850 EMT0 [1].

Reset support was introduced in commit 934337080c6c ("phy: marvell:
phy-mvebu-a3700-comphy: Add native kernel implementation") as part of a
complete rewrite of this driver. The v1 patch series of that commit [2]
did not contain reset support and was tested to work fine with Ethernet,
SATA and USB PHYs.

So for now remove the broken reset support and change the implementation
of the power_off callback to power off all functions on the specified
lane (not only the selected function), because during startup the kernel
does not know which function was selected and configured by the
bootloader. The same logic was used in the v1 patch series of that
commit.

This change fixes the issues with initialization of the SATA disk Samsung
SSD 850 and the disk is working again, like before the mentioned commit.

Once the problem with the PHY reset callback is solved, its functionality
can be re-introduced. But for now it is unknown why it does not work.

[1] - https://lore.kernel.org/r/20220531124159.3e4lgn2v462irbtz@shindev/
[2] - https://lore.kernel.org/r/20211028184242.22105-1-kabel@kernel.org/

Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Fixes: 934337080c6c ("phy: marvell: phy-mvebu-a3700-comphy: Add native kernel implementation")
Cc: stable@vger.kernel.org # v5.18+
Signed-off-by: Pali Rohár <pali@kernel.org>
Tested-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Link: https://lore.kernel.org/r/20220829083046.15082-1-pali@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
 drivers/phy/marvell/phy-mvebu-a3700-comphy.c | 87 ++++----------------
 1 file changed, 17 insertions(+), 70 deletions(-)

--- a/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
+++ b/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
@@ -274,7 +274,6 @@ struct mvebu_a3700_comphy_lane {
     int submode;
     bool invert_tx;
     bool invert_rx;
-    bool needs_reset;
 };

 struct gbe_phy_init_data_fix {
@@ -1097,40 +1096,12 @@ mvebu_a3700_comphy_pcie_power_off(struct
                 0x0, PU_PLL_BIT | PU_RX_BIT | PU_TX_BIT);
 }

-static int mvebu_a3700_comphy_reset(struct phy *phy)
+static void mvebu_a3700_comphy_usb3_power_off(struct mvebu_a3700_comphy_lane *lane)
 {
-    struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
-    u16 mask, data;
-
-    dev_dbg(lane->dev, "resetting lane %d\n", lane->id);
-
-    /* COMPHY reset for internal logic */
-    comphy_lane_reg_set(lane, COMPHY_SFT_RESET,
-                SFT_RST_NO_REG, SFT_RST_NO_REG);
-
-    /* COMPHY register reset (cleared automatically) */
-    comphy_lane_reg_set(lane, COMPHY_SFT_RESET, SFT_RST, SFT_RST);
-
-    /* PIPE soft and register reset */
-    data = PIPE_SOFT_RESET | PIPE_REG_RESET;
-    mask = data;
-    comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL, data, mask);
-
-    /* Release PIPE register reset */
-    comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL,
-                0x0, PIPE_REG_RESET);
-
-    /* Reset SB configuration register (only for lanes 0 and 1) */
-    if (lane->id == 0 || lane->id == 1) {
-        u32 mask, data;
-
-        data = PIN_RESET_CORE_BIT | PIN_RESET_COMPHY_BIT |
-               PIN_PU_PLL_BIT | PIN_PU_RX_BIT | PIN_PU_TX_BIT;
-        mask = data | PIN_PU_IVREF_BIT | PIN_TX_IDLE_BIT;
-        comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, data, mask);
-    }
-
-    return 0;
+    /*
+     * The USB3 MAC sets the USB3 PHY to low state, so we do not
+     * need to power off USB3 PHY again.
+     */
 }

 static bool mvebu_a3700_comphy_check_mode(int lane,
@@ -1171,10 +1142,6 @@ static int mvebu_a3700_comphy_set_mode(s
         (lane->mode != mode || lane->submode != submode))
         return -EBUSY;

-    /* If changing mode, ensure reset is called */
-    if (lane->mode != PHY_MODE_INVALID && lane->mode != mode)
-        lane->needs_reset = true;
-
     /* Just remember the mode, ->power_on() will do the real setup */
     lane->mode = mode;
     lane->submode = submode;
@@ -1185,7 +1152,6 @@ static int mvebu_a3700_comphy_set_mode(s
 static int mvebu_a3700_comphy_power_on(struct phy *phy)
 {
     struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
-    int ret;

     if (!mvebu_a3700_comphy_check_mode(lane->id, lane->mode,
                        lane->submode)) {
@@ -1193,14 +1159,6 @@ static int mvebu_a3700_comphy_power_on(s
         return -EINVAL;
     }

-    if (lane->needs_reset) {
-        ret = mvebu_a3700_comphy_reset(phy);
-        if (ret)
-            return ret;
-
-        lane->needs_reset = false;
-    }
-
     switch (lane->mode) {
     case PHY_MODE_USB_HOST_SS:
         dev_dbg(lane->dev, "set lane %d to USB3 host mode\n", lane->id);
@@ -1224,38 +1182,28 @@ static int mvebu_a3700_comphy_power_off(
 {
     struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);

-    switch (lane->mode) {
-    case PHY_MODE_USB_HOST_SS:
-        /*
-         * The USB3 MAC sets the USB3 PHY to low state, so we do not
-         * need to power off USB3 PHY again.
-         */
-        break;
-
-    case PHY_MODE_SATA:
-        mvebu_a3700_comphy_sata_power_off(lane);
-        break;
-
-    case PHY_MODE_ETHERNET:
+    switch (lane->id) {
+    case 0:
+        mvebu_a3700_comphy_usb3_power_off(lane);
         mvebu_a3700_comphy_ethernet_power_off(lane);
-        break;
-
-    case PHY_MODE_PCIE:
+        return 0;
+    case 1:
         mvebu_a3700_comphy_pcie_power_off(lane);
-        break;
-
+        mvebu_a3700_comphy_ethernet_power_off(lane);
+        return 0;
+    case 2:
+        mvebu_a3700_comphy_usb3_power_off(lane);
|
||||
+ mvebu_a3700_comphy_sata_power_off(lane);
|
||||
+ return 0;
|
||||
default:
|
||||
dev_err(lane->dev, "invalid COMPHY mode\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
-
|
||||
- return 0;
|
||||
}
|
||||
|
||||
static const struct phy_ops mvebu_a3700_comphy_ops = {
|
||||
.power_on = mvebu_a3700_comphy_power_on,
|
||||
.power_off = mvebu_a3700_comphy_power_off,
|
||||
- .reset = mvebu_a3700_comphy_reset,
|
||||
.set_mode = mvebu_a3700_comphy_set_mode,
|
||||
.owner = THIS_MODULE,
|
||||
};
|
||||
@@ -1393,8 +1341,7 @@ static int mvebu_a3700_comphy_probe(stru
|
||||
* To avoid relying on the bootloader/firmware configuration,
|
||||
* power off all comphys.
|
||||
*/
|
||||
- mvebu_a3700_comphy_reset(phy);
|
||||
- lane->needs_reset = false;
|
||||
+ mvebu_a3700_comphy_power_off(phy);
|
||||
}
|
||||
|
||||
provider = devm_of_phy_provider_register(&pdev->dev,
|
@ -0,0 +1,90 @@
From 86fc59ef818beb0e1945d17f8e734898baba7e4e Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Sun, 13 Mar 2022 15:45:23 -0700
Subject: [PATCH 1/2] regmap: add configurable downshift for addresses

Add an additional reg_downshift to be applied to register addresses before
any register accesses. An example of a device that uses this is the VSC7514
chip, which requires each register address to be downshifted by two if the
access is performed over a SPI bus.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Link: https://lore.kernel.org/r/20220313224524.399947-2-colin.foster@in-advantage.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/internal.h | 1 +
 drivers/base/regmap/regmap.c   | 5 +++++
 include/linux/regmap.h         | 3 +++
 3 files changed, 9 insertions(+)

--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -31,6 +31,7 @@ struct regmap_format {
 	size_t buf_size;
 	size_t reg_bytes;
 	size_t pad_bytes;
+	size_t reg_downshift;
 	size_t val_bytes;
 	void (*format_write)(struct regmap *map,
 			     unsigned int reg, unsigned int val);
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -823,6 +823,7 @@ struct regmap *__regmap_init(struct devi
 
 	map->format.reg_bytes = DIV_ROUND_UP(config->reg_bits, 8);
 	map->format.pad_bytes = config->pad_bits / 8;
+	map->format.reg_downshift = config->reg_downshift;
 	map->format.val_bytes = DIV_ROUND_UP(config->val_bits, 8);
 	map->format.buf_size = DIV_ROUND_UP(config->reg_bits +
 			config->val_bits + config->pad_bits, 8);
@@ -1735,6 +1736,7 @@ static int _regmap_raw_write_impl(struct
 			return ret;
 	}
 
+	reg >>= map->format.reg_downshift;
 	map->format.format_reg(map->work_buf, reg, map->reg_shift);
 	regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
 				      map->write_flag_mask);
@@ -1905,6 +1907,7 @@ static int _regmap_bus_formatted_write(v
 			return ret;
 	}
 
+	reg >>= map->format.reg_downshift;
 	map->format.format_write(map, reg, val);
 
 	trace_regmap_hw_write_start(map, reg, 1);
@@ -2346,6 +2349,7 @@ static int _regmap_raw_multi_reg_write(s
 		unsigned int reg = regs[i].reg;
 		unsigned int val = regs[i].def;
 		trace_regmap_hw_write_start(map, reg, 1);
+		reg >>= map->format.reg_downshift;
 		map->format.format_reg(u8, reg, map->reg_shift);
 		u8 += reg_bytes + pad_bytes;
 		map->format.format_val(u8, val, 0);
@@ -2673,6 +2677,7 @@ static int _regmap_raw_read(struct regma
 			return ret;
 	}
 
+	reg >>= map->format.reg_downshift;
 	map->format.format_reg(map->work_buf, reg, map->reg_shift);
 	regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
 				      map->read_flag_mask);
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -237,6 +237,8 @@ typedef void (*regmap_unlock)(void *);
  * @reg_stride: The register address stride. Valid register addresses are a
  *		multiple of this value. If set to 0, a value of 1 will be
  *		used.
+ * @reg_downshift: The number of bits to downshift the register before
+ *		   performing any operations.
  * @pad_bits: Number of bits of padding between register and value.
  * @val_bits: Number of bits in a register value, mandatory.
  *
@@ -360,6 +362,7 @@ struct regmap_config {
 
 	int reg_bits;
 	int reg_stride;
+	int reg_downshift;
 	int pad_bits;
 	int val_bits;
 
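
For illustration only: a driver for a SPI-attached device like the one the commit mentions could request the new downshift as sketched below. The field values and the config name are assumptions of this example, not taken from the patch.

/* Minimal sketch (C): regmap_config for a hypothetical SPI-attached
 * device whose bus register addresses are the MMIO addresses divided
 * by four, i.e. shifted right by two. All numbers are illustrative.
 */
static const struct regmap_config example_spi_regmap_config = {
	.reg_bits	= 24,	/* address width on the wire */
	.val_bits	= 32,	/* register value width */
	.reg_stride	= 4,	/* MMIO-style addresses, 4-byte aligned */
	.reg_downshift	= 2,	/* bus expects address >> 2 */
};

With such a config, an access to register 0x100 would put 0x40 on the bus, while an MMIO user of the same register map keeps addressing 0x100.
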
@ -0,0 +1,95 @@
From 0074f3f2b1e43d3cedd97e47fb6980db6d2ba79e Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Sun, 13 Mar 2022 15:45:24 -0700
Subject: [PATCH 2/2] regmap: allow a defined reg_base to be added to every
 address

There's an inconsistency that arises when a register set can be accessed
internally via MMIO, or externally via SPI. The VSC7514 chip allows both
modes of operation. When internally accessed, the system utilizes __iomem,
devm_ioremap_resource, and devm_regmap_init_mmio.

For SPI it isn't possible to utilize memory-mapped IO. To properly operate,
the resource base must be added to the register before every operation.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Link: https://lore.kernel.org/r/20220313224524.399947-3-colin.foster@in-advantage.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/internal.h | 1 +
 drivers/base/regmap/regmap.c   | 6 ++++++
 include/linux/regmap.h         | 3 +++
 3 files changed, 10 insertions(+)

--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -63,6 +63,7 @@ struct regmap {
 	regmap_unlock unlock;
 	void *lock_arg; /* This is passed to lock/unlock functions */
 	gfp_t alloc_flags;
+	unsigned int reg_base;
 
 	struct device *dev; /* Device we do I/O on */
 	void *work_buf;     /* Scratch buffer used to format I/O */
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -821,6 +821,8 @@ struct regmap *__regmap_init(struct devi
 	else
 		map->alloc_flags = GFP_KERNEL;
 
+	map->reg_base = config->reg_base;
+
 	map->format.reg_bytes = DIV_ROUND_UP(config->reg_bits, 8);
 	map->format.pad_bytes = config->pad_bits / 8;
 	map->format.reg_downshift = config->reg_downshift;
@@ -1736,6 +1738,7 @@ static int _regmap_raw_write_impl(struct
 			return ret;
 	}
 
+	reg += map->reg_base;
 	reg >>= map->format.reg_downshift;
 	map->format.format_reg(map->work_buf, reg, map->reg_shift);
 	regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
@@ -1907,6 +1910,7 @@ static int _regmap_bus_formatted_write(v
 			return ret;
 	}
 
+	reg += map->reg_base;
 	reg >>= map->format.reg_downshift;
 	map->format.format_write(map, reg, val);
 
@@ -2349,6 +2353,7 @@ static int _regmap_raw_multi_reg_write(s
 		unsigned int reg = regs[i].reg;
 		unsigned int val = regs[i].def;
 		trace_regmap_hw_write_start(map, reg, 1);
+		reg += map->reg_base;
 		reg >>= map->format.reg_downshift;
 		map->format.format_reg(u8, reg, map->reg_shift);
 		u8 += reg_bytes + pad_bytes;
@@ -2677,6 +2682,7 @@ static int _regmap_raw_read(struct regma
 			return ret;
 	}
 
+	reg += map->reg_base;
 	reg >>= map->format.reg_downshift;
 	map->format.format_reg(map->work_buf, reg, map->reg_shift);
 	regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -239,6 +239,8 @@ typedef void (*regmap_unlock)(void *);
  *		used.
  * @reg_downshift: The number of bits to downshift the register before
  *		   performing any operations.
+ * @reg_base: Value to be added to every register address before performing any
+ *	      operation.
 * @pad_bits: Number of bits of padding between register and value.
 * @val_bits: Number of bits in a register value, mandatory.
 *
@@ -363,6 +365,7 @@ struct regmap_config {
 
 	int reg_bits;
 	int reg_stride;
 	int reg_downshift;
+	unsigned int reg_base;
 	int pad_bits;
 	int val_bits;
 
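
Combined with reg_downshift from the previous patch, the transform the core now applies before formatting an address can be summarized as below. This is a sketch mirroring the order used in the patched code (base added first, then the shift); the numbers are invented:

/* Sketch (C) of the address transform done by the patched helpers:
 *
 *	bus_reg = (reg + map->reg_base) >> map->format.reg_downshift;
 *
 * e.g. with reg_base = 0x1000 and reg_downshift = 2, register 0x40
 * goes out on the bus as (0x40 + 0x1000) >> 2 = 0x410.
 */
static inline unsigned int example_bus_addr(unsigned int reg,
					    unsigned int reg_base,
					    unsigned int reg_downshift)
{
	return (reg + reg_base) >> reg_downshift;
}
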
@ -0,0 +1,57 @@
From 697c3892d825fb78f42ec8e53bed065dd728db3e Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Mon, 30 Jan 2023 02:04:57 +0000
Subject: [PATCH] regmap: apply reg_base and reg_downshift for single register
 ops

reg_base and reg_downshift currently don't have any effect if used with
a regmap_bus or regmap_config which only offers single register
operations (i.e. reg_read, reg_write and optionally reg_update_bits).

Fix that and take them into account also for regmap_bus with only
reg_read and reg_write operations by applying reg_base and
reg_downshift in _regmap_bus_reg_write, _regmap_bus_reg_read.

Also apply reg_base and reg_downshift in _regmap_update_bits, but only
in case the operation is carried out with a reg_update_bits call
defined in either regmap_bus or regmap_config.

Fixes: 0074f3f2b1e43d ("regmap: allow a defined reg_base to be added to every address")
Fixes: 86fc59ef818beb ("regmap: add configurable downshift for addresses")
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Tested-by: Colin Foster <colin.foster@in-advantage.com>
Link: https://lore.kernel.org/r/Y9clyVS3tQEHlUhA@makrotopia.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
 drivers/base/regmap/regmap.c | 6 ++++++
 1 file changed, 6 insertions(+)

--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -1929,6 +1929,8 @@ static int _regmap_bus_reg_write(void *c
 {
 	struct regmap *map = context;
 
+	reg += map->reg_base;
+	reg >>= map->format.reg_downshift;
 	return map->bus->reg_write(map->bus_context, reg, val);
 }
 
@@ -2703,6 +2705,8 @@ static int _regmap_bus_reg_read(void *co
 {
 	struct regmap *map = context;
 
+	reg += map->reg_base;
+	reg >>= map->format.reg_downshift;
 	return map->bus->reg_read(map->bus_context, reg, val);
 }
 
@@ -3078,6 +3082,8 @@ static int _regmap_update_bits(struct re
 		*change = false;
 
 	if (regmap_volatile(map, reg) && map->reg_update_bits) {
+		reg += map->reg_base;
+		reg >>= map->format.reg_downshift;
 		ret = map->reg_update_bits(map->bus_context, reg, mask, val);
 		if (ret == 0 && change)
 			*change = true;
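
The buses this fix targets look roughly like the sketch below: no block read/write callbacks, only per-register accessors, so the raw paths patched in the two earlier commits were never taken. The names are hypothetical:

/* Sketch (C): a regmap_bus offering only single-register operations.
 * Before this fix, reg_base/reg_downshift were silently ignored on
 * this path.
 */
static int example_bus_reg_read(void *context, unsigned int reg,
				unsigned int *val);
static int example_bus_reg_write(void *context, unsigned int reg,
				 unsigned int val);

static const struct regmap_bus example_single_reg_bus = {
	.reg_read	= example_bus_reg_read,
	.reg_write	= example_bus_reg_write,
};
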
@ -0,0 +1,72 @@
From bcdf0315a61a29eb753a607d3a85a4032de72d94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 10 May 2022 15:12:59 +0200
Subject: [PATCH] mtd: call of_platform_populate() for MTD partitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Until this change, the MTD subsystem supported handling partitions only
with MTD partition parsers. That's a specific / limited API designed
around partitions.

Some MTD partitions may however require different handling. They may
contain specific data that needs to be parsed and somehow extracted. For
that purpose the MTD subsystem should allow binding of standard platform
drivers.

An example is a U-Boot (sub)partition with environment variables.
There exists a "u-boot,env" DT binding for an MTD (sub)partition that
requires an NVMEM driver.

Ref: 5db1c2dbc04c ("dt-bindings: nvmem: add U-Boot environment variables binding")
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220510131259.555-1-zajec5@gmail.com
---
 drivers/mtd/mtdpart.c | 9 +++++++++
 1 file changed, 9 insertions(+)

--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -17,6 +17,7 @@
 #include <linux/mtd/partitions.h>
 #include <linux/err.h>
 #include <linux/of.h>
+#include <linux/of_platform.h>
 
 #include "mtdcore.h"
 
@@ -577,10 +578,16 @@ static int mtd_part_of_parse(struct mtd_
 	struct mtd_part_parser *parser;
 	struct device_node *np;
 	struct property *prop;
+	struct device *dev;
 	const char *compat;
 	const char *fixed = "fixed-partitions";
 	int ret, err = 0;
 
+	dev = &master->dev;
+	/* Use parent device (controller) if the top level MTD is not registered */
+	if (!IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) && !mtd_is_partition(master))
+		dev = master->dev.parent;
+
 	np = mtd_get_of_node(master);
 	if (mtd_is_partition(master))
 		of_node_get(np);
@@ -593,6 +600,7 @@ static int mtd_part_of_parse(struct mtd_
 			continue;
 		ret = mtd_part_do_parse(parser, master, pparts, NULL);
 		if (ret > 0) {
+			of_platform_populate(np, NULL, NULL, dev);
 			of_node_put(np);
 			return ret;
 		}
@@ -600,6 +608,7 @@ static int mtd_part_of_parse(struct mtd_
 		if (ret < 0 && !err)
 			err = ret;
 	}
+	of_platform_populate(np, NULL, NULL, dev);
 	of_node_put(np);
 
 	/*
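
A partition that now gets a platform device bound to it could be described like this; a sketch in DT source based on the "u-boot,env" binding referenced above, with invented offset and size:

partitions {
	compatible = "fixed-partitions";
	#address-cells = <1>;
	#size-cells = <1>;

	partition@40000 {
		compatible = "u-boot,env";
		label = "u-boot-env";
		reg = <0x40000 0x10000>;
	};
};
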
@ -0,0 +1,302 @@
From 9b78ef0c7997052e9eaa0f7a4513d546fa17358c Mon Sep 17 00:00:00 2001
From: Mikhail Zhilkin <csharper2005@gmail.com>
Date: Sun, 29 May 2022 11:07:14 +0000
Subject: [PATCH] mtd: parsers: add support for Sercomm partitions

This adds an MTD partition parser for the Sercomm partition table that
is used in some Beeline, Netgear and Sercomm routers.

The Sercomm partition map table contains real partition offsets, which
may differ from device to device depending on the number and location of
bad blocks on NAND.

Original patch (proposed by NOGUCHI Hiroshi):
Link: https://github.com/openwrt/openwrt/pull/1318#issuecomment-420607394

Signed-off-by: NOGUCHI Hiroshi <drvlabo@gmail.com>
Signed-off-by: Mikhail Zhilkin <csharper2005@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220529110714.189732-1-csharper2005@gmail.com
---
 drivers/mtd/parsers/Kconfig  |   9 ++
 drivers/mtd/parsers/Makefile |   1 +
 drivers/mtd/parsers/scpart.c | 248 +++++++++++++++++++++++++++++++++++
 3 files changed, 258 insertions(+)
 create mode 100644 drivers/mtd/parsers/scpart.c

--- a/drivers/mtd/parsers/Kconfig
+++ b/drivers/mtd/parsers/Kconfig
@@ -186,3 +186,12 @@ config MTD_QCOMSMEM_PARTS
 	help
 	  This provides support for parsing partitions from Shared Memory (SMEM)
 	  for NAND and SPI flash on Qualcomm platforms.
+
+config MTD_SERCOMM_PARTS
+	tristate "Sercomm partition table parser"
+	depends on MTD && RALINK
+	help
+	  This provides partitions table parser for devices with Sercomm
+	  partition map. This partition table contains real partition
+	  offsets, which may differ from device to device depending on the
+	  number and location of bad blocks on NAND.
--- a/drivers/mtd/parsers/Makefile
+++ b/drivers/mtd/parsers/Makefile
@@ -10,6 +10,7 @@ ofpart-$(CONFIG_MTD_OF_PARTS_LINKSYS_NS)
 obj-$(CONFIG_MTD_PARSER_IMAGETAG)	+= parser_imagetag.o
 obj-$(CONFIG_MTD_AFS_PARTS)		+= afs.o
 obj-$(CONFIG_MTD_PARSER_TRX)		+= parser_trx.o
+obj-$(CONFIG_MTD_SERCOMM_PARTS)		+= scpart.o
 obj-$(CONFIG_MTD_SHARPSL_PARTS)		+= sharpslpart.o
 obj-$(CONFIG_MTD_REDBOOT_PARTS)		+= redboot.o
 obj-$(CONFIG_MTD_QCOMSMEM_PARTS)	+= qcomsmempart.o
--- /dev/null
+++ b/drivers/mtd/parsers/scpart.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ *    drivers/mtd/scpart.c: Sercomm Partition Parser
+ *
+ *    Copyright (C) 2018 NOGUCHI Hiroshi
+ *    Copyright (C) 2022 Mikhail Zhilkin
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/module.h>
+
+#define	MOD_NAME	"scpart"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) MOD_NAME ": " fmt
+
+#define	ID_ALREADY_FOUND	0xffffffffUL
+
+#define	MAP_OFFS_IN_BLK		0x800
+#define	MAP_MIRROR_NUM		2
+
+static const char sc_part_magic[] = {
+	'S', 'C', 'F', 'L', 'M', 'A', 'P', 'O', 'K', '\0',
+};
+#define	PART_MAGIC_LEN		sizeof(sc_part_magic)
+
+/* assumes that all fields are set by CPU native endian */
+struct sc_part_desc {
+	uint32_t	part_id;
+	uint32_t	part_offs;
+	uint32_t	part_bytes;
+};
+
+static uint32_t scpart_desc_is_valid(struct sc_part_desc *pdesc)
+{
+	return ((pdesc->part_id != 0xffffffffUL) &&
+		(pdesc->part_offs != 0xffffffffUL) &&
+		(pdesc->part_bytes != 0xffffffffUL));
+}
+
+static int scpart_scan_partmap(struct mtd_info *master, loff_t partmap_offs,
+			       struct sc_part_desc **ppdesc)
+{
+	int cnt = 0;
+	int res = 0;
+	int res2;
+	loff_t offs;
+	size_t retlen;
+	struct sc_part_desc *pdesc = NULL;
+	struct sc_part_desc *tmpdesc;
+	uint8_t *buf;
+
+	buf = kzalloc(master->erasesize, GFP_KERNEL);
+	if (!buf) {
+		res = -ENOMEM;
+		goto out;
+	}
+
+	res2 = mtd_read(master, partmap_offs, master->erasesize, &retlen, buf);
+	if (res2 || retlen != master->erasesize) {
+		res = -EIO;
+		goto free;
+	}
+
+	for (offs = MAP_OFFS_IN_BLK;
+	     offs < master->erasesize - sizeof(*tmpdesc);
+	     offs += sizeof(*tmpdesc)) {
+		tmpdesc = (struct sc_part_desc *)&buf[offs];
+		if (!scpart_desc_is_valid(tmpdesc))
+			break;
+		cnt++;
+	}
+
+	if (cnt > 0) {
+		int bytes = cnt * sizeof(*pdesc);
+
+		pdesc = kcalloc(cnt, sizeof(*pdesc), GFP_KERNEL);
+		if (!pdesc) {
+			res = -ENOMEM;
+			goto free;
+		}
+		memcpy(pdesc, &(buf[MAP_OFFS_IN_BLK]), bytes);
+
+		*ppdesc = pdesc;
+		res = cnt;
+	}
+
+free:
+	kfree(buf);
+
+out:
+	return res;
+}
+
+static int scpart_find_partmap(struct mtd_info *master,
+			       struct sc_part_desc **ppdesc)
+{
+	int magic_found = 0;
+	int res = 0;
+	int res2;
+	loff_t offs = 0;
+	size_t retlen;
+	uint8_t rdbuf[PART_MAGIC_LEN];
+
+	while ((magic_found < MAP_MIRROR_NUM) &&
+	       (offs < master->size) &&
+	       !mtd_block_isbad(master, offs)) {
+		res2 = mtd_read(master, offs, PART_MAGIC_LEN, &retlen, rdbuf);
+		if (res2 || retlen != PART_MAGIC_LEN) {
+			res = -EIO;
+			goto out;
+		}
+		if (!memcmp(rdbuf, sc_part_magic, PART_MAGIC_LEN)) {
+			pr_debug("Signature found at 0x%llx\n", offs);
+			magic_found++;
+			res = scpart_scan_partmap(master, offs, ppdesc);
+			if (res > 0)
+				goto out;
+		}
+		offs += master->erasesize;
+	}
+
+out:
+	if (res > 0)
+		pr_info("Valid 'SC PART MAP' (%d partitions) found at 0x%llx\n", res, offs);
+	else
+		pr_info("No valid 'SC PART MAP' was found\n");
+
+	return res;
+}
+
+static int scpart_parse(struct mtd_info *master,
+			const struct mtd_partition **pparts,
+			struct mtd_part_parser_data *data)
+{
+	const char *partname;
+	int n;
+	int nr_scparts;
+	int nr_parts = 0;
+	int res = 0;
+	struct sc_part_desc *scpart_map = NULL;
+	struct mtd_partition *parts = NULL;
+	struct device_node *mtd_node;
+	struct device_node *ofpart_node;
+	struct device_node *pp;
+
+	mtd_node = mtd_get_of_node(master);
+	if (!mtd_node) {
+		res = -ENOENT;
+		goto out;
+	}
+
+	ofpart_node = of_get_child_by_name(mtd_node, "partitions");
+	if (!ofpart_node) {
+		pr_info("%s: 'partitions' subnode not found on %pOF.\n",
+			master->name, mtd_node);
+		res = -ENOENT;
+		goto out;
+	}
+
+	nr_scparts = scpart_find_partmap(master, &scpart_map);
+	if (nr_scparts <= 0) {
+		pr_info("No any partitions was found in 'SC PART MAP'.\n");
+		res = -ENOENT;
+		goto free;
+	}
+
+	parts = kcalloc(of_get_child_count(ofpart_node), sizeof(*parts),
+			GFP_KERNEL);
+	if (!parts) {
+		res = -ENOMEM;
+		goto free;
+	}
+
+	for_each_child_of_node(ofpart_node, pp) {
+		u32 scpart_id;
+
+		if (of_property_read_u32(pp, "sercomm,scpart-id", &scpart_id))
+			continue;
+
+		for (n = 0 ; n < nr_scparts ; n++)
+			if ((scpart_map[n].part_id != ID_ALREADY_FOUND) &&
+			    (scpart_id == scpart_map[n].part_id))
+				break;
+		if (n >= nr_scparts)
+			/* not match */
+			continue;
+
+		/* add the partition found in OF into MTD partition array */
+		parts[nr_parts].offset = scpart_map[n].part_offs;
+		parts[nr_parts].size = scpart_map[n].part_bytes;
+		parts[nr_parts].of_node = pp;
+
+		if (!of_property_read_string(pp, "label", &partname))
+			parts[nr_parts].name = partname;
+		if (of_property_read_bool(pp, "read-only"))
+			parts[nr_parts].mask_flags |= MTD_WRITEABLE;
+		if (of_property_read_bool(pp, "lock"))
+			parts[nr_parts].mask_flags |= MTD_POWERUP_LOCK;
+
+		/* mark as 'done' */
+		scpart_map[n].part_id = ID_ALREADY_FOUND;
+
+		nr_parts++;
+	}
+
+	if (nr_parts > 0) {
+		*pparts = parts;
+		res = nr_parts;
+	} else
+		pr_info("No partition in OF matches partition ID with 'SC PART MAP'.\n");
+
+	of_node_put(pp);
+
+free:
+	kfree(scpart_map);
+	if (res <= 0)
+		kfree(parts);
+
+out:
+	return res;
+}
+
+static const struct of_device_id scpart_parser_of_match_table[] = {
+	{ .compatible = "sercomm,sc-partitions" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, scpart_parser_of_match_table);
+
+static struct mtd_part_parser scpart_parser = {
+	.parse_fn	= scpart_parse,
+	.name		= "scpart",
+	.of_match_table = scpart_parser_of_match_table,
+};
+module_mtd_part_parser(scpart_parser);
+
+/* mtd parsers will request the module by parser name */
+MODULE_ALIAS("scpart");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("NOGUCHI Hiroshi <drvlabo@gmail.com>");
+MODULE_AUTHOR("Mikhail Zhilkin <csharper2005@gmail.com>");
+MODULE_DESCRIPTION("Sercomm partition parser");
@ -0,0 +1,106 @@
From ad9b10d1eaada169bd764abcab58f08538877e26 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Wed, 22 Jun 2022 03:06:28 +0200
Subject: mtd: core: introduce of support for dynamic partitions

We have many parsers that register MTD partitions at runtime. Examples
are the cmdlinepart and the smem-part parsers, where the compatible is
defined in the dts and the partitions get detected and registered by
the parser. This is problematic for the NVMEM subsystem, which requires
an OF node to detect NVMEM cells.

To fix this problem, introduce additional logic that will try to
assign an OF node to the MTD if one is declared.

On MTD addition, it is checked whether the MTD has an OF node; if not,
we check whether a partition with the same label / node name is
declared in the DTS. If an exact match is found, the partition
dynamically allocated by the parser gets a connected OF node.

The NVMEM subsystem will detect the OF node and register any NVMEM cells
declared statically in the DTS.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220622010628.30414-4-ansuelsmth@gmail.com
---
 drivers/mtd/mtdcore.c | 61 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -564,6 +564,66 @@ static int mtd_nvmem_add(struct mtd_info
 	return 0;
 }
 
+static void mtd_check_of_node(struct mtd_info *mtd)
+{
+	struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
+	const char *pname, *prefix = "partition-";
+	int plen, mtd_name_len, offset, prefix_len;
+	struct mtd_info *parent;
+	bool found = false;
+
+	/* Check if MTD already has a device node */
+	if (dev_of_node(&mtd->dev))
+		return;
+
+	/* Check if a partitions node exist */
+	parent = mtd->parent;
+	parent_dn = dev_of_node(&parent->dev);
+	if (!parent_dn)
+		return;
+
+	partitions = of_get_child_by_name(parent_dn, "partitions");
+	if (!partitions)
+		goto exit_parent;
+
+	prefix_len = strlen(prefix);
+	mtd_name_len = strlen(mtd->name);
+
+	/* Search if a partition is defined with the same name */
+	for_each_child_of_node(partitions, mtd_dn) {
+		offset = 0;
+
+		/* Skip partition with no/wrong prefix */
+		if (!of_node_name_prefix(mtd_dn, "partition-"))
+			continue;
+
+		/* Label have priority. Check that first */
+		if (of_property_read_string(mtd_dn, "label", &pname)) {
+			of_property_read_string(mtd_dn, "name", &pname);
+			offset = prefix_len;
+		}
+
+		plen = strlen(pname) - offset;
+		if (plen == mtd_name_len &&
+		    !strncmp(mtd->name, pname + offset, plen)) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found)
+		goto exit_partitions;
+
+	/* Set of_node only for nvmem */
+	if (of_device_is_compatible(mtd_dn, "nvmem-cells"))
+		mtd_set_of_node(mtd, mtd_dn);
+
+exit_partitions:
+	of_node_put(partitions);
+exit_parent:
+	of_node_put(parent_dn);
+}
+
 /**
  *	add_mtd_device - register an MTD device
  *	@mtd: pointer to new MTD device info structure
@@ -669,6 +729,7 @@ int add_mtd_device(struct mtd_info *mtd)
 	mtd->dev.devt = MTD_DEVT(i);
 	dev_set_name(&mtd->dev, "mtd%d", i);
 	dev_set_drvdata(&mtd->dev, mtd);
+	mtd_check_of_node(mtd);
 	of_node_get(mtd_get_of_node(mtd));
 	error = device_register(&mtd->dev);
 	if (error) {
@ -0,0 +1,72 @@
From b0321721be50b80c03a51866a94fde4f94690e18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Wed, 15 Jun 2022 21:42:59 +0200
Subject: [PATCH] mtd: allow getting MTD device associated with a specific DT
 node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The MTD subsystem API allows interacting with MTD devices (e.g. reading,
writing, handling bad blocks). So far a random driver could get an MTD
device only by its name (get_mtd_device_nm()). This change allows
getting them also by a DT node.

This API is required for drivers handling DT-defined MTD partitions in a
specific way (e.g. a U-Boot (sub)partition with environment variables).

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
---
 drivers/mtd/mtdcore.c   | 28 ++++++++++++++++++++++++++++
 include/linux/mtd/mtd.h |  1 +
 2 files changed, 29 insertions(+)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1236,6 +1236,34 @@ int __get_mtd_device(struct mtd_info *mt
 EXPORT_SYMBOL_GPL(__get_mtd_device);
 
 /**
+ * of_get_mtd_device_by_node - obtain an MTD device associated with a given node
+ *
+ * @np: device tree node
+ */
+struct mtd_info *of_get_mtd_device_by_node(struct device_node *np)
+{
+	struct mtd_info *mtd = NULL;
+	struct mtd_info *tmp;
+	int err;
+
+	mutex_lock(&mtd_table_mutex);
+
+	err = -EPROBE_DEFER;
+	mtd_for_each_device(tmp) {
+		if (mtd_get_of_node(tmp) == np) {
+			mtd = tmp;
+			err = __get_mtd_device(mtd);
+			break;
+		}
+	}
+
+	mutex_unlock(&mtd_table_mutex);
+
+	return err ? ERR_PTR(err) : mtd;
+}
+EXPORT_SYMBOL_GPL(of_get_mtd_device_by_node);
+
+/**
  *	get_mtd_device_nm - obtain a validated handle for an MTD device by
  *	device name
  *	@name: MTD device name to open
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -682,6 +682,7 @@ extern int mtd_device_unregister(struct
 extern struct mtd_info *get_mtd_device(struct mtd_info *mtd, int num);
 extern int __get_mtd_device(struct mtd_info *mtd);
 extern void __put_mtd_device(struct mtd_info *mtd);
+extern struct mtd_info *of_get_mtd_device_by_node(struct device_node *np);
 extern struct mtd_info *get_mtd_device_nm(const char *name);
 extern void put_mtd_device(struct mtd_info *mtd);
 
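
A consumer could use the new helper roughly as sketched here; the "mtd" phandle property name and the surrounding driver context are assumptions of this example:

/* Sketch (C): resolve an MTD device referenced by phandle, deferring
 * probe until that MTD has been registered.
 */
struct device_node *np = of_parse_phandle(dev->of_node, "mtd", 0);
struct mtd_info *mtd = of_get_mtd_device_by_node(np);

of_node_put(np);
if (IS_ERR(mtd))
	return PTR_ERR(mtd);	/* -EPROBE_DEFER if not (yet) registered */

/* ... mtd_read()/mtd_write() ... */
put_mtd_device(mtd);	/* drop the reference taken by the lookup */
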
@ -0,0 +1,30 @@
From 7ec4cdb321738d44ae5d405e7b6ac73dfbf99caa Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 25 Jul 2022 22:49:25 +0900
Subject: [PATCH] mtd: core: check partition before dereference

syzbot is reporting a NULL pointer dereference at mtd_check_of_node() [1],
because the mtdram test device (CONFIG_MTD_MTDRAM) is not a partition.

Link: https://syzkaller.appspot.com/bug?extid=fe013f55a2814a9e8cfd [1]
Reported-by: syzbot <syzbot+fe013f55a2814a9e8cfd@syzkaller.appspotmail.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: ad9b10d1eaada169 ("mtd: core: introduce of support for dynamic partitions")
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
CC: stable@vger.kernel.org
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/mtdcore.c | 2 ++
 1 file changed, 2 insertions(+)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -577,6 +577,8 @@ static void mtd_check_of_node(struct mtd
 		return;
 
 	/* Check if a partitions node exist */
+	if (!mtd_is_partition(mtd))
+		return;
 	parent = mtd->parent;
 	parent_dn = dev_of_node(&parent->dev);
 	if (!parent_dn)
@ -0,0 +1,101 @@
From 12b58961de0bd88b3c7dfa5d21f6d67f4678b780 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 18 Oct 2022 07:18:22 +0200
Subject: [PATCH] mtd: core: add missing of_node_get() in dynamic partitions
 code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes unbalanced of_node_put():
[ 1.078910] 6 cmdlinepart partitions found on MTD device gpmi-nand
[ 1.085116] Creating 6 MTD partitions on "gpmi-nand":
[ 1.090181] 0x000000000000-0x000008000000 : "nandboot"
[ 1.096952] 0x000008000000-0x000009000000 : "nandfit"
[ 1.103547] 0x000009000000-0x00000b000000 : "nandkernel"
[ 1.110317] 0x00000b000000-0x00000c000000 : "nanddtb"
[ 1.115525] ------------[ cut here ]------------
[ 1.120141] refcount_t: addition on 0; use-after-free.
[ 1.125328] WARNING: CPU: 0 PID: 1 at lib/refcount.c:25 refcount_warn_saturate+0xdc/0x148
[ 1.133528] Modules linked in:
[ 1.136589] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.0.0-rc7-next-20220930-04543-g8cf3f7
[ 1.146342] Hardware name: Freescale i.MX8DXL DDR3L EVK (DT)
[ 1.151999] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 1.158965] pc : refcount_warn_saturate+0xdc/0x148
[ 1.163760] lr : refcount_warn_saturate+0xdc/0x148
[ 1.168556] sp : ffff800009ddb080
[ 1.171866] x29: ffff800009ddb080 x28: ffff800009ddb35a x27: 0000000000000002
[ 1.179015] x26: ffff8000098b06ad x25: ffffffffffffffff x24: ffff0a00ffffff05
[ 1.186165] x23: ffff00001fdf6470 x22: ffff800009ddb367 x21: 0000000000000000
[ 1.193314] x20: ffff00001fdfebe8 x19: ffff00001fdfec50 x18: ffffffffffffffff
[ 1.200464] x17: 0000000000000000 x16: 0000000000000118 x15: 0000000000000004
[ 1.207614] x14: 0000000000000fff x13: ffff800009bca248 x12: 0000000000000003
[ 1.214764] x11: 00000000ffffefff x10: c0000000ffffefff x9 : 4762cb2ccb52de00
[ 1.221914] x8 : 4762cb2ccb52de00 x7 : 205d313431303231 x6 : 312e31202020205b
[ 1.229063] x5 : ffff800009d55c1f x4 : 0000000000000001 x3 : 0000000000000000
[ 1.236213] x2 : 0000000000000000 x1 : ffff800009954be6 x0 : 000000000000002a
[ 1.243365] Call trace:
[ 1.245806]  refcount_warn_saturate+0xdc/0x148
[ 1.250253]  kobject_get+0x98/0x9c
[ 1.253658]  of_node_get+0x20/0x34
[ 1.257072]  of_fwnode_get+0x3c/0x54
[ 1.260652]  fwnode_get_nth_parent+0xd8/0xf4
[ 1.264926]  fwnode_full_name_string+0x3c/0xb4
[ 1.269373]  device_node_string+0x498/0x5b4
[ 1.273561]  pointer+0x41c/0x5d0
[ 1.276793]  vsnprintf+0x4d8/0x694
[ 1.280198]  vprintk_store+0x164/0x528
[ 1.283951]  vprintk_emit+0x98/0x164
[ 1.287530]  vprintk_default+0x44/0x6c
[ 1.291284]  vprintk+0xf0/0x134
[ 1.294428]  _printk+0x54/0x7c
[ 1.297486]  of_node_release+0xe8/0x128
[ 1.301326]  kobject_put+0x98/0xfc
[ 1.304732]  of_node_put+0x1c/0x28
[ 1.308137]  add_mtd_device+0x484/0x6d4
[ 1.311977]  add_mtd_partitions+0xf0/0x1d0
[ 1.316078]  parse_mtd_partitions+0x45c/0x518
[ 1.320439]  mtd_device_parse_register+0xb0/0x274
[ 1.325147]  gpmi_nand_probe+0x51c/0x650
[ 1.329074]  platform_probe+0xa8/0xd0
[ 1.332740]  really_probe+0x130/0x334
[ 1.336406]  __driver_probe_device+0xb4/0xe0
[ 1.340681]  driver_probe_device+0x3c/0x1f8
[ 1.344869]  __driver_attach+0xdc/0x1a4
[ 1.348708]  bus_for_each_dev+0x80/0xcc
[ 1.352548]  driver_attach+0x24/0x30
[ 1.356127]  bus_add_driver+0x108/0x1f4
[ 1.359967]  driver_register+0x78/0x114
[ 1.363807]  __platform_driver_register+0x24/0x30
[ 1.368515]  gpmi_nand_driver_init+0x1c/0x28
[ 1.372798]  do_one_initcall+0xbc/0x238
[ 1.376638]  do_initcall_level+0x94/0xb4
[ 1.380565]  do_initcalls+0x54/0x94
[ 1.384058]  do_basic_setup+0x1c/0x28
[ 1.387724]  kernel_init_freeable+0x110/0x188
[ 1.392084]  kernel_init+0x20/0x1a0
[ 1.395578]  ret_from_fork+0x10/0x20
[ 1.399157] ---[ end trace 0000000000000000 ]---
[ 1.403782] ------------[ cut here ]------------

Reported-by: Han Xu <han.xu@nxp.com>
Fixes: ad9b10d1eaada169 ("mtd: core: introduce of support for dynamic partitions")
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Tested-by: Han Xu <han.xu@nxp.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221018051822.28685-1-zajec5@gmail.com
---
 drivers/mtd/mtdcore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -580,7 +580,7 @@ static void mtd_check_of_node(struct mtd
 	if (!mtd_is_partition(mtd))
 		return;
 	parent = mtd->parent;
-	parent_dn = dev_of_node(&parent->dev);
+	parent_dn = of_node_get(dev_of_node(&parent->dev));
 	if (!parent_dn)
 		return;
 
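
The rule the fix restores is the usual OF refcount pairing, sketched below: of_node_put() must only consume a reference the code itself took, and dev_of_node() does not take one.

/* Sketch (C): balance a later of_node_put() with an explicit get,
 * since dev_of_node() returns the node without incrementing its
 * refcount.
 */
struct device_node *dn = of_node_get(dev_of_node(&parent->dev));

if (!dn)
	return;
/* ... use dn ... */
of_node_put(dn);	/* drops exactly the reference taken above */
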
@ -0,0 +1,65 @@
From 63db0cb35e1cb3b3c134906d1062f65513fdda2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 4 Oct 2022 10:37:09 +0200
Subject: [PATCH] mtd: core: simplify (a bit) code find partition-matching
 dynamic OF node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Don't hardcode "partition-" string twice
2. Use simpler logic & use ->name to avoid of_property_read_string()
3. Use mtd_get_of_node() helper

Cc: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221004083710.27704-1-zajec5@gmail.com
---
 drivers/mtd/mtdcore.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -569,18 +569,16 @@ static void mtd_check_of_node(struct mtd
 	struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
 	const char *pname, *prefix = "partition-";
 	int plen, mtd_name_len, offset, prefix_len;
-	struct mtd_info *parent;
 	bool found = false;
 
 	/* Check if MTD already has a device node */
-	if (dev_of_node(&mtd->dev))
+	if (mtd_get_of_node(mtd))
 		return;
 
 	/* Check if a partitions node exist */
 	if (!mtd_is_partition(mtd))
 		return;
-	parent = mtd->parent;
-	parent_dn = of_node_get(dev_of_node(&parent->dev));
+	parent_dn = of_node_get(mtd_get_of_node(mtd->parent));
 	if (!parent_dn)
 		return;
 
@@ -593,15 +591,15 @@ static void mtd_check_of_node(struct mtd
 
 	/* Search if a partition is defined with the same name */
 	for_each_child_of_node(partitions, mtd_dn) {
-		offset = 0;
-
 		/* Skip partition with no/wrong prefix */
-		if (!of_node_name_prefix(mtd_dn, "partition-"))
+		if (!of_node_name_prefix(mtd_dn, prefix))
 			continue;
 
 		/* Label have priority. Check that first */
-		if (of_property_read_string(mtd_dn, "label", &pname)) {
-			of_property_read_string(mtd_dn, "name", &pname);
+		if (!of_property_read_string(mtd_dn, "label", &pname)) {
+			offset = 0;
+		} else {
+			pname = mtd_dn->name;
 			offset = prefix_len;
 		}
 
@ -0,0 +1,84 @@
From ddb8cefb7af288950447ca6eeeafb09977dab56f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 4 Oct 2022 10:37:10 +0200
Subject: [PATCH] mtd: core: try to find OF node for every MTD partition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So far this feature was limited to the top-level "nvmem-cells" node.
There are multiple parsers creating partitions and subpartitions
dynamically. Extend that code to handle them too.

This allows finding the partition-* node for every MTD (sub)partition.

Random example:

partitions {
	compatible = "brcm,bcm947xx-cfe-partitions";

	partition-firmware {
		compatible = "brcm,trx";

		partition-loader {
		};
	};
};

Cc: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221004083710.27704-2-zajec5@gmail.com
---
 drivers/mtd/mtdcore.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -569,20 +569,22 @@ static void mtd_check_of_node(struct mtd
 	struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
 	const char *pname, *prefix = "partition-";
 	int plen, mtd_name_len, offset, prefix_len;
-	bool found = false;
 
 	/* Check if MTD already has a device node */
 	if (mtd_get_of_node(mtd))
 		return;
 
-	/* Check if a partitions node exist */
 	if (!mtd_is_partition(mtd))
 		return;
+
 	parent_dn = of_node_get(mtd_get_of_node(mtd->parent));
 	if (!parent_dn)
 		return;
 
-	partitions = of_get_child_by_name(parent_dn, "partitions");
+	if (mtd_is_partition(mtd->parent))
+		partitions = of_node_get(parent_dn);
+	else
+		partitions = of_get_child_by_name(parent_dn, "partitions");
 	if (!partitions)
 		goto exit_parent;
 
@@ -606,19 +608,11 @@ static void mtd_check_of_node(struct mtd
 		plen = strlen(pname) - offset;
 		if (plen == mtd_name_len &&
 		    !strncmp(mtd->name, pname + offset, plen)) {
-			found = true;
+			mtd_set_of_node(mtd, mtd_dn);
 			break;
 		}
 	}
 
-	if (!found)
-		goto exit_partitions;
-
-	/* Set of_node only for nvmem */
-	if (of_device_is_compatible(mtd_dn, "nvmem-cells"))
-		mtd_set_of_node(mtd, mtd_dn);
-
-exit_partitions:
 	of_node_put(partitions);
 exit_parent:
 	of_node_put(parent_dn);
@ -0,0 +1,32 @@
From 26bccc9671ba5e01f7153addbe94e7dc3f677375 Mon Sep 17 00:00:00 2001
From: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
Date: Mon, 3 Jan 2022 03:03:16 +0000
Subject: [PATCH 13/14] mtd: parsers: qcom: Don't print error message on
 -EPROBE_DEFER

It's possible for the main smem driver to not be loaded by the time we
come along to parse the smem partition description, but this is a
perfectly normal thing.

No need to print out an error message in this case.

Signed-off-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220103030316.58301-3-bryan.odonoghue@linaro.org
---
 drivers/mtd/parsers/qcomsmempart.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

--- a/drivers/mtd/parsers/qcomsmempart.c
+++ b/drivers/mtd/parsers/qcomsmempart.c
@@ -75,7 +75,8 @@ static int parse_qcomsmem_part(struct mt
 	pr_debug("Parsing partition table info from SMEM\n");
 	ptable = qcom_smem_get(SMEM_APPS, SMEM_AARM_PARTITION_TABLE, &len);
 	if (IS_ERR(ptable)) {
-		pr_err("Error reading partition table header\n");
+		if (PTR_ERR(ptable) != -EPROBE_DEFER)
+			pr_err("Error reading partition table header\n");
 		return PTR_ERR(ptable);
 	}
 
|
||||
From 26422ac78e9d8767bd4aabfbae616b15edbf6a1b Mon Sep 17 00:00:00 2001
|
||||
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
|
||||
Date: Sat, 22 Oct 2022 23:13:18 +0200
|
||||
Subject: [PATCH] mtd: core: set ROOT_DEV for partitions marked as rootfs in DT
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
This adds support for "linux,rootfs" binding that is used to mark flash
|
||||
partition containing rootfs. It's useful for devices using device tree
|
||||
that don't have bootloader passing root info in cmdline.
|
||||
|
||||
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
|
||||
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
|
||||
Link: https://lore.kernel.org/linux-mtd/20221022211318.32009-2-zajec5@gmail.com
|
||||
---
|
||||
drivers/mtd/mtdcore.c | 12 ++++++++++++
|
||||
1 file changed, 12 insertions(+)
|
||||
|
||||
--- a/drivers/mtd/mtdcore.c
|
||||
+++ b/drivers/mtd/mtdcore.c
|
||||
@@ -28,6 +28,7 @@
|
||||
#include <linux/leds.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/nvmem-provider.h>
|
||||
+#include <linux/root_dev.h>
|
||||
|
||||
#include <linux/mtd/mtd.h>
|
||||
#include <linux/mtd/partitions.h>
|
||||
@@ -748,6 +749,17 @@ int add_mtd_device(struct mtd_info *mtd)
|
||||
not->add(mtd);
|
||||
|
||||
mutex_unlock(&mtd_table_mutex);
|
||||
+
|
||||
+ if (of_find_property(mtd_get_of_node(mtd), "linux,rootfs", NULL)) {
|
||||
+ if (IS_BUILTIN(CONFIG_MTD)) {
|
||||
+ pr_info("mtd: setting mtd%d (%s) as root device\n", mtd->index, mtd->name);
|
||||
+ ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
|
||||
+ } else {
|
||||
+ pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n",
|
||||
+ mtd->index, mtd->name);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
/* We _know_ we aren't being removed, because
|
||||
our caller is still holding us here. So none
|
||||
of this try_ nonsense, and no bitching about it
|
@ -0,0 +1,33 @@
|
||||
From 2365f91c861cbfeef7141c69842848c7b2d3c2db Mon Sep 17 00:00:00 2001
|
||||
From: INAGAKI Hiroshi <musashino.open@gmail.com>
|
||||
Date: Sun, 13 Feb 2022 15:40:44 +0900
|
||||
Subject: [PATCH] mtd: parsers: trx: allow to use on MediaTek MIPS SoCs
|
||||
|
||||
Buffalo sells some router devices which have trx-formatted firmware,
|
||||
based on MediaTek MIPS SoCs. To use parser_trx on those devices, add
|
||||
"RALINK" to dependency and allow to compile for MediaTek MIPS SoCs.
|
||||
|
||||
examples:
|
||||
|
||||
- WCR-1166DS (MT7628)
|
||||
- WSR-1166DHP (MT7621)
|
||||
- WSR-2533DHP (MT7621)
|
||||
|
||||
Signed-off-by: INAGAKI Hiroshi <musashino.open@gmail.com>
|
||||
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
|
||||
Link: https://lore.kernel.org/linux-mtd/20220213064045.1781-1-musashino.open@gmail.com
|
||||
---
|
||||
drivers/mtd/parsers/Kconfig | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
--- a/drivers/mtd/parsers/Kconfig
|
||||
+++ b/drivers/mtd/parsers/Kconfig
|
||||
@@ -115,7 +115,7 @@ config MTD_AFS_PARTS
|
||||
|
||||
config MTD_PARSER_TRX
|
||||
tristate "Parser for TRX format partitions"
|
||||
- depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || COMPILE_TEST)
|
||||
+ depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || RALINK || COMPILE_TEST)
|
||||
help
|
||||
TRX is a firmware format used by Broadcom on their devices. It
|
||||
may contain up to 3/4 partitions (depending on the version).
|
@ -0,0 +1,58 @@
|
||||
From 573eec222bc82fb5e724586267fbbb1aed9ffd03 Mon Sep 17 00:00:00 2001
|
||||
From: Chuanhong Guo <gch981213@gmail.com>
|
||||
Date: Sun, 20 Mar 2022 17:59:58 +0800
|
||||
Subject: [PATCH 2/5] mtd: spinand: gigadevice: add support for GD5FxGQ4xExxG
|
||||
|
||||
Add support for:
|
||||
GD5F1GQ4RExxG
|
||||
GD5F2GQ4{U,R}ExxG
|
||||
|
||||
These chips differ from GD5F1GQ4UExxG only in chip ID, voltage
|
||||
and capacity.
|
||||
|
||||
Signed-off-by: Chuanhong Guo <gch981213@gmail.com>
|
||||
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
|
||||
Link: https://lore.kernel.org/linux-mtd/20220320100001.247905-3-gch981213@gmail.com
|
||||
---
|
||||
drivers/mtd/nand/spi/gigadevice.c | 30 ++++++++++++++++++++++++++++++
|
||||
1 file changed, 30 insertions(+)
|
||||
|
||||
--- a/drivers/mtd/nand/spi/gigadevice.c
|
||||
+++ b/drivers/mtd/nand/spi/gigadevice.c
|
||||
@@ -333,6 +333,36 @@ static const struct spinand_info gigadev
|
||||
SPINAND_HAS_QE_BIT,
|
||||
SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
|
||||
gd5fxgq4uexxg_ecc_get_status)),
|
||||
+ SPINAND_INFO("GD5F1GQ4RExxG",
|
||||
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xc1),
|
||||
+ NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
|
||||
+ NAND_ECCREQ(8, 512),
|
||||
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
|
||||
+ &write_cache_variants,
|
||||
+ &update_cache_variants),
|
||||
+ SPINAND_HAS_QE_BIT,
|
||||
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
|
||||
+ gd5fxgq4uexxg_ecc_get_status)),
|
||||
+ SPINAND_INFO("GD5F2GQ4UExxG",
|
||||
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xd2),
|
||||
+ NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
|
||||
+ NAND_ECCREQ(8, 512),
|
||||
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
|
||||
+ &write_cache_variants,
|
||||
+ &update_cache_variants),
|
||||
+ SPINAND_HAS_QE_BIT,
|
||||
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
|
||||
+ gd5fxgq4uexxg_ecc_get_status)),
|
||||
+ SPINAND_INFO("GD5F2GQ4RExxG",
|
||||
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xc2),
|
||||
+ NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
|
||||
+ NAND_ECCREQ(8, 512),
|
||||
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
|
||||
+ &write_cache_variants,
|
||||
+ &update_cache_variants),
|
||||
+ SPINAND_HAS_QE_BIT,
|
||||
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
|
||||
+ gd5fxgq4uexxg_ecc_get_status)),
|
||||
SPINAND_INFO("GD5F1GQ4UFxxG",
|
||||
SPINAND_ID(SPINAND_READID_METHOD_OPCODE, 0xb1, 0x48),
|
||||
NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
|
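
For readers of these tables, the geometry macros decode as sketched below for the first new entry; the field meanings follow NAND_MEMORG()'s parameter order in include/linux/mtd/nand.h:

/* NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1) reads as:
 *	1    bit per cell (SLC)
 *	2048 byte pages with 128 bytes of OOB each
 *	64   pages per erase block
 *	1024 erase blocks per LUN -> 64 * 2048 * 1024 bytes = 128 MiB (1 Gbit)
 *	20   maximum bad erase blocks per LUN
 *	1    plane per LUN, 1 LUN per target, 1 target
 * NAND_ECCREQ(8, 512) requests 8-bit ECC strength per 512-byte step.
 */
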
@ -0,0 +1,33 @@
From 620a988813403318023296b61228ee8f3fcdb8e0 Mon Sep 17 00:00:00 2001
From: Chuanhong Guo <gch981213@gmail.com>
Date: Sun, 20 Mar 2022 17:59:59 +0800
Subject: [PATCH 3/5] mtd: spinand: gigadevice: add support for GD5F1GQ5RExxG

This chip is the 1.8v version of GD5F1GQ5UExxG.

Signed-off-by: Chuanhong Guo <gch981213@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220320100001.247905-4-gch981213@gmail.com
---
 drivers/mtd/nand/spi/gigadevice.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

--- a/drivers/mtd/nand/spi/gigadevice.c
+++ b/drivers/mtd/nand/spi/gigadevice.c
@@ -383,6 +383,16 @@ static const struct spinand_info gigadev
 		     SPINAND_HAS_QE_BIT,
 		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
 				     gd5fxgq5xexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F1GQ5RExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x41),
+		     NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
+		     NAND_ECCREQ(4, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq5xexxg_ecc_get_status)),
 };
 
 static const struct spinand_manufacturer_ops gigadevice_spinand_manuf_ops = {
@ -0,0 +1,84 @@
From 194ec04b3a9e7fa97d1fbef296410631bc3cf1c8 Mon Sep 17 00:00:00 2001
From: Chuanhong Guo <gch981213@gmail.com>
Date: Sun, 20 Mar 2022 18:00:00 +0800
Subject: [PATCH 4/5] mtd: spinand: gigadevice: add support for GD5F{2,
4}GQ5xExxG

Add support for:
GD5F2GQ5{U,R}ExxG
GD5F4GQ6{U,R}ExxG

These chips use 4 dummy bytes for quad io and 2 dummy bytes for dual io.
Besides that and memory layout, they are identical to their 1G variant.

Signed-off-by: Chuanhong Guo <gch981213@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220320100001.247905-5-gch981213@gmail.com
---
drivers/mtd/nand/spi/gigadevice.c | 48 +++++++++++++++++++++++++++++++
1 file changed, 48 insertions(+)

--- a/drivers/mtd/nand/spi/gigadevice.c
+++ b/drivers/mtd/nand/spi/gigadevice.c
@@ -47,6 +47,14 @@ static SPINAND_OP_VARIANTS(read_cache_va
SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));

+static SPINAND_OP_VARIANTS(read_cache_variants_2gq5,
+ SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(0, 4, NULL, 0),
+ SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+ SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(0, 2, NULL, 0),
+ SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+ SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+ SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
static SPINAND_OP_VARIANTS(write_cache_variants,
SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
SPINAND_PROG_LOAD(true, 0, NULL, 0));
@@ -391,6 +399,46 @@ static const struct spinand_info gigadev
&write_cache_variants,
&update_cache_variants),
SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq5xexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F2GQ5UExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x52),
+ NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
+ NAND_ECCREQ(4, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq5xexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F2GQ5RExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x42),
+ NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
+ NAND_ECCREQ(4, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq5xexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F4GQ6UExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x55),
+ NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 2, 1),
+ NAND_ECCREQ(4, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq5xexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F4GQ6RExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x45),
+ NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 2, 1),
+ NAND_ECCREQ(4, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
gd5fxgq5xexxg_ecc_get_status)),
};
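
For reference when reading the SPINAND_INFO entries above (an annotation, not part of the patch): in the upstream NAND_MEMORG() macro from include/linux/mtd/nand.h the arguments are, in order, bits per cell, page size, OOB size, pages per eraseblock, eraseblocks per LUN, max bad eraseblocks per LUN, planes per LUN, LUNs per target, and number of targets. Assuming that layout, the GD5F4GQ6UExxG line decodes as:

	NAND_MEMORG(1,		/* bits per cell (SLC) */
		    2048,	/* page size, bytes */
		    128,	/* OOB size, bytes */
		    64,		/* pages per eraseblock */
		    2048,	/* eraseblocks per LUN */
		    40,		/* max bad eraseblocks per LUN */
		    1,		/* planes per LUN */
		    2,		/* LUNs per target */
		    1)		/* targets */

2048 B x 64 pages x 2048 blocks x 2 LUNs = 512 MiB = 4 Gbit, matching the part name; the 2 Gbit GQ5 entries differ only in using a single LUN.
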
@ -0,0 +1,91 @@
From 54647cd003c08b714474a5b599a147ec6a160486 Mon Sep 17 00:00:00 2001
From: Chuanhong Guo <gch981213@gmail.com>
Date: Sun, 20 Mar 2022 18:00:01 +0800
Subject: [PATCH 5/5] mtd: spinand: gigadevice: add support for GD5FxGM7xExxG

Add support for:
GD5F{1,2}GM7{U,R}ExxG
GD5F4GM8{U,R}ExxG

These are new 27nm counterparts for the GD5FxGQ4 chips from GigaDevice
with 8b/512b on-die ECC capability.
These chips (and the currently supported GD5FxGQ5 chips) have a QIO DTR
instruction for reading the page cache. It isn't added in this patch because
I don't have a DTR SPI controller for testing.

Signed-off-by: Chuanhong Guo <gch981213@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220320100001.247905-6-gch981213@gmail.com
---
drivers/mtd/nand/spi/gigadevice.c | 60 +++++++++++++++++++++++++++++++
1 file changed, 60 insertions(+)

--- a/drivers/mtd/nand/spi/gigadevice.c
+++ b/drivers/mtd/nand/spi/gigadevice.c
@@ -441,6 +441,66 @@ static const struct spinand_info gigadev
SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
gd5fxgq5xexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F1GM7UExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x91),
+ NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq4uexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F1GM7RExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x81),
+ NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq4uexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F2GM7UExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x92),
+ NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq4uexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F2GM7RExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x82),
+ NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq4uexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F4GM8UExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x95),
+ NAND_MEMORG(1, 2048, 128, 64, 4096, 80, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq4uexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F4GM8RExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x85),
+ NAND_MEMORG(1, 2048, 128, 64, 4096, 80, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq4uexxg_ecc_get_status)),
};

static const struct spinand_manufacturer_ops gigadevice_spinand_manuf_ops = {
@ -0,0 +1,229 @@
From aec4d5f5ffd0f0092bd9dc21ea90e0bc237d4b74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Sat, 15 Oct 2022 11:29:50 +0200
Subject: [PATCH] mtd: parsers: add TP-Link SafeLoader partitions table parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This parser deals with most TP-Link home routers. It reads info about
partitions and registers them in the MTD subsystem.

Example from TP-Link Archer C5 V2:

spi-nor spi0.0: s25fl128s1 (16384 Kbytes)
15 tplink-safeloader partitions found on MTD device spi0.0
Creating 15 MTD partitions on "spi0.0":
0x000000000000-0x000000040000 : "fs-uboot"
0x000000040000-0x000000440000 : "os-image"
0x000000440000-0x000000e40000 : "rootfs"
0x000000e40000-0x000000e40200 : "default-mac"
0x000000e40200-0x000000e40400 : "pin"
0x000000e40400-0x000000e40600 : "product-info"
0x000000e50000-0x000000e60000 : "partition-table"
0x000000e60000-0x000000e60200 : "soft-version"
0x000000e61000-0x000000e70000 : "support-list"
0x000000e70000-0x000000e80000 : "profile"
0x000000e80000-0x000000e90000 : "default-config"
0x000000e90000-0x000000ee0000 : "user-config"
0x000000ee0000-0x000000fe0000 : "log"
0x000000fe0000-0x000000ff0000 : "radio_bk"
0x000000ff0000-0x000001000000 : "radio"

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221015092950.27467-2-zajec5@gmail.com
---
drivers/mtd/parsers/Kconfig | 15 +++
drivers/mtd/parsers/Makefile | 1 +
drivers/mtd/parsers/tplink_safeloader.c | 150 ++++++++++++++++++++++++
3 files changed, 166 insertions(+)
create mode 100644 drivers/mtd/parsers/tplink_safeloader.c

--- a/drivers/mtd/parsers/Kconfig
+++ b/drivers/mtd/parsers/Kconfig
@@ -113,6 +113,21 @@ config MTD_AFS_PARTS
for your particular device. It won't happen automatically. The
'physmap' map driver (CONFIG_MTD_PHYSMAP) does this, for example.

+config MTD_PARSER_TPLINK_SAFELOADER
+ tristate "TP-Link Safeloader partitions parser"
+ depends on MTD && (ARCH_BCM_5301X || ATH79 || SOC_MT7620 || SOC_MT7621 || COMPILE_TEST)
+ help
+ TP-Link home routers use flash partitions to store various data. Info
+ about flash space layout is stored in a partitions table using a
+ custom ASCII-based format.
+
+ That format was first found in devices with SafeLoader bootloader and
+ was named after it. Later it was adapted to CFE and U-Boot
+ bootloaders.
+
+ This driver reads partitions table, parses it and creates MTD
+ partitions.
+
config MTD_PARSER_TRX
tristate "Parser for TRX format partitions"
depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || RALINK || COMPILE_TEST)
--- a/drivers/mtd/parsers/Makefile
+++ b/drivers/mtd/parsers/Makefile
@@ -9,6 +9,7 @@ ofpart-$(CONFIG_MTD_OF_PARTS_BCM4908) +=
ofpart-$(CONFIG_MTD_OF_PARTS_LINKSYS_NS)+= ofpart_linksys_ns.o
obj-$(CONFIG_MTD_PARSER_IMAGETAG) += parser_imagetag.o
obj-$(CONFIG_MTD_AFS_PARTS) += afs.o
+obj-$(CONFIG_MTD_PARSER_TPLINK_SAFELOADER) += tplink_safeloader.o
obj-$(CONFIG_MTD_PARSER_TRX) += parser_trx.o
obj-$(CONFIG_MTD_SERCOMM_PARTS) += scpart.o
obj-$(CONFIG_MTD_SHARPSL_PARTS) += sharpslpart.o
--- /dev/null
+++ b/drivers/mtd/parsers/tplink_safeloader.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2022 Rafał Miłecki <rafal@milecki.pl>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+
+#define TPLINK_SAFELOADER_DATA_OFFSET 4
+#define TPLINK_SAFELOADER_MAX_PARTS 32
+
+struct safeloader_cmn_header {
+ __be32 size;
+ uint32_t unused;
+} __packed;
+
+static void *mtd_parser_tplink_safeloader_read_table(struct mtd_info *mtd)
+{
+ struct safeloader_cmn_header hdr;
+ struct device_node *np;
+ size_t bytes_read;
+ size_t offset;
+ size_t size;
+ char *buf;
+ int err;
+
+ np = mtd_get_of_node(mtd);
+ if (mtd_is_partition(mtd))
+ of_node_get(np);
+ else
+ np = of_get_child_by_name(np, "partitions");
+
+ if (of_property_read_u32(np, "partitions-table-offset", (u32 *)&offset)) {
+ pr_err("Failed to get partitions table offset\n");
+ goto err_put;
+ }
+
+ err = mtd_read(mtd, offset, sizeof(hdr), &bytes_read, (uint8_t *)&hdr);
+ if (err && !mtd_is_bitflip(err)) {
+ pr_err("Failed to read from %s at 0x%zx\n", mtd->name, offset);
+ goto err_put;
+ }
+
+ size = be32_to_cpu(hdr.size);
+
+ buf = kmalloc(size + 1, GFP_KERNEL);
+ if (!buf)
+ goto err_put;
+
+ err = mtd_read(mtd, offset + sizeof(hdr), size, &bytes_read, buf);
+ if (err && !mtd_is_bitflip(err)) {
+ pr_err("Failed to read from %s at 0x%zx\n", mtd->name, offset + sizeof(hdr));
+ goto err_kfree;
+ }
+
+ buf[size] = '\0';
+
+ of_node_put(np);
+
+ return buf;
+
+err_kfree:
+ kfree(buf);
+err_put:
+ of_node_put(np);
+ return NULL;
+}
+
+static int mtd_parser_tplink_safeloader_parse(struct mtd_info *mtd,
+ const struct mtd_partition **pparts,
+ struct mtd_part_parser_data *data)
+{
+ struct mtd_partition *parts;
+ char name[65];
+ size_t offset;
+ size_t bytes;
+ char *buf;
+ int idx;
+ int err;
+
+ parts = kcalloc(TPLINK_SAFELOADER_MAX_PARTS, sizeof(*parts), GFP_KERNEL);
+ if (!parts) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ buf = mtd_parser_tplink_safeloader_read_table(mtd);
+ if (!buf) {
+ err = -ENOENT;
+ goto err_out;
+ }
+
+ for (idx = 0, offset = TPLINK_SAFELOADER_DATA_OFFSET;
+ idx < TPLINK_SAFELOADER_MAX_PARTS &&
+ sscanf(buf + offset, "partition %64s base 0x%llx size 0x%llx%zn\n",
+ name, &parts[idx].offset, &parts[idx].size, &bytes) == 3;
+ idx++, offset += bytes + 1) {
+ parts[idx].name = kstrdup(name, GFP_KERNEL);
+ if (!parts[idx].name) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+ }
+
+ if (idx == TPLINK_SAFELOADER_MAX_PARTS)
+ pr_warn("Reached maximum number of partitions!\n");
+
+ kfree(buf);
+
+ *pparts = parts;
+
+ return idx;
+
+err_free:
+ for (idx -= 1; idx >= 0; idx--)
+ kfree(parts[idx].name);
+err_out:
+ return err;
+};
+
+static void mtd_parser_tplink_safeloader_cleanup(const struct mtd_partition *pparts,
+ int nr_parts)
+{
+ int i;
+
+ for (i = 0; i < nr_parts; i++)
+ kfree(pparts[i].name);
+
+ kfree(pparts);
+}
+
+static const struct of_device_id mtd_parser_tplink_safeloader_of_match_table[] = {
+ { .compatible = "tplink,safeloader-partitions" },
+ {},
+};
+MODULE_DEVICE_TABLE(of, mtd_parser_tplink_safeloader_of_match_table);
+
+static struct mtd_part_parser mtd_parser_tplink_safeloader = {
+ .parse_fn = mtd_parser_tplink_safeloader_parse,
+ .cleanup = mtd_parser_tplink_safeloader_cleanup,
+ .name = "tplink-safeloader",
+ .of_match_table = mtd_parser_tplink_safeloader_of_match_table,
+};
+module_mtd_part_parser(mtd_parser_tplink_safeloader);
+
+MODULE_LICENSE("GPL");
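
For reference (not part of the patch): the sscanf() format in the parser above implies the on-flash partitions table is plain ASCII, one "partition <name> base <hex> size <hex>" entry per line. A hypothetical fragment matching the Archer C5 V2 log in the commit message (values illustrative, reconstructed from that log rather than dumped from a device):

	partition fs-uboot base 0x00000 size 0x40000
	partition os-image base 0x40000 size 0x400000
	partition rootfs base 0x440000 size 0xa00000
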
@ -0,0 +1,49 @@
From 6abef37d16d0c570ef5a149e63762fba2a30804b Mon Sep 17 00:00:00 2001
From: "Leon M. George" <leon@georgemail.eu>
Date: Wed, 30 Mar 2022 16:16:56 +0200
Subject: [PATCH] mtd: spi-nor: support eon en25qh256a variant

The EN25QH256A variant of the EN25QH256 doesn't initialize correctly from SFDP
alone and only accesses memory below 8m (addr_width is 4 but read_opcode takes
only 3 bytes).

Set SNOR_F_4B_OPCODES if the flash chip variant was detected using hwcaps.

The fix submitted upstream uses the PARSE_SFDP initializer that is not
available in the kernel used with OpenWrt.

Signed-off-by: Leon M. George <leon@georgemail.eu>
---
drivers/mtd/spi-nor/eon.c | 11 +++++++++++
1 file changed, 11 insertions(+)

--- a/drivers/mtd/spi-nor/eon.c
+++ b/drivers/mtd/spi-nor/eon.c
@@ -8,6 +8,16 @@

#include "core.h"

+static void en25qh256_post_sfdp_fixups(struct spi_nor *nor)
+{
+ if (nor->params->hwcaps.mask & SNOR_HWCAPS_READ_1_1_4)
+ nor->flags |= SNOR_F_4B_OPCODES;
+}
+
+static const struct spi_nor_fixups en25qh256_fixups = {
+ .post_sfdp = en25qh256_post_sfdp_fixups,
+};
+
static const struct flash_info eon_parts[] = {
/* EON -- en25xxx */
{ "en25f32", INFO(0x1c3116, 0, 64 * 1024, 64, SECT_4K) },
@@ -23,7 +33,9 @@ static const struct flash_info eon_parts
{ "en25qh64", INFO(0x1c7017, 0, 64 * 1024, 128,
SECT_4K | SPI_NOR_DUAL_READ) },
{ "en25qh128", INFO(0x1c7018, 0, 64 * 1024, 256, 0) },
- { "en25qh256", INFO(0x1c7019, 0, 64 * 1024, 512, 0) },
+ { "en25qh256", INFO(0x1c7019, 0, 64 * 1024, 512,
+ SPI_NOR_DUAL_READ)
+ .fixups = &en25qh256_fixups },
{ "en25s64", INFO(0x1c3817, 0, 64 * 1024, 128, SECT_4K) },
};

@ -0,0 +1,73 @@
From e237285113963bd1dd2e925770aa8b3aa8a1894c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= <kernel@kempniu.pl>
Date: Wed, 29 Jun 2022 14:57:34 +0200
Subject: [PATCH 1/4] mtd: track maximum number of bitflips for each read
request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

mtd_read_oob() callers are currently oblivious to the details of ECC
errors detected during the read operation - they only learn (through the
return value) whether any corrected bitflips or uncorrectable errors
occurred. More detailed ECC information can be useful to user-space
applications for making better-informed choices about moving data
around.

Extend struct mtd_oob_ops with a pointer to a newly-introduced struct
mtd_req_stats and set its 'max_bitflips' field to the maximum number of
bitflips found in a single ECC step during the read operation performed
by mtd_read_oob(). This is a prerequisite for ultimately passing that
value back to user space.

Suggested-by: Boris Brezillon <boris.brezillon@collabora.com>
Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-2-kernel@kempniu.pl
---
drivers/mtd/mtdcore.c | 5 +++++
include/linux/mtd/mtd.h | 5 +++++
2 files changed, 10 insertions(+)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1676,6 +1676,9 @@ int mtd_read_oob(struct mtd_info *mtd, l
if (!master->_read_oob && (!master->_read || ops->oobbuf))
return -EOPNOTSUPP;

+ if (ops->stats)
+ memset(ops->stats, 0, sizeof(*ops->stats));
+
if (mtd->flags & MTD_SLC_ON_MLC_EMULATION)
ret_code = mtd_io_emulated_slc(mtd, from, true, ops);
else
@@ -1693,6 +1696,8 @@ int mtd_read_oob(struct mtd_info *mtd, l
return ret_code;
if (mtd->ecc_strength == 0)
return 0; /* device lacks ecc */
+ if (ops->stats)
+ ops->stats->max_bitflips = ret_code;
return ret_code >= mtd->bitflip_threshold ? -EUCLEAN : 0;
}
EXPORT_SYMBOL_GPL(mtd_read_oob);
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -40,6 +40,10 @@ struct mtd_erase_region_info {
unsigned long *lockmap; /* If keeping bitmap of locks */
};

+struct mtd_req_stats {
+ unsigned int max_bitflips;
+};
+
/**
* struct mtd_oob_ops - oob operation operands
* @mode: operation mode
@@ -70,6 +74,7 @@ struct mtd_oob_ops {
uint32_t ooboffs;
uint8_t *datbuf;
uint8_t *oobbuf;
+ struct mtd_req_stats *stats;
};

#define MTD_MAX_OOBFREE_ENTRIES_LARGE 32
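
To illustrate the new field (a minimal sketch of a hypothetical in-kernel caller, not code from the patch): a reader opts in by pointing 'stats' at a struct mtd_req_stats of its own; callers that leave the pointer NULL keep the old behaviour.

	struct mtd_req_stats stats = { };
	struct mtd_oob_ops ops = {
		.mode = MTD_OPS_PLACE_OOB,
		.len = len,
		.datbuf = buf,
		.stats = &stats,	/* NULL skips the accounting */
	};
	int ret = mtd_read_oob(mtd, from, &ops);

	/* On a successful ECC-protected read, stats.max_bitflips now holds
	 * the worst bitflip count seen in any single ECC step. */
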
@ -0,0 +1,325 @@
From e97709c9d18903f5acd5fbe2985dd054da0432b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= <kernel@kempniu.pl>
Date: Wed, 29 Jun 2022 14:57:35 +0200
Subject: [PATCH 2/4] mtd: always initialize 'stats' in struct mtd_oob_ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As the 'stats' field in struct mtd_oob_ops is used in conditional
expressions, ensure it is always zero-initialized in all such structures
to prevent random stack garbage from being interpreted as a pointer.

Strictly speaking, this problem currently only needs to be fixed for
struct mtd_oob_ops structures subsequently passed to mtd_read_oob().
However, this commit goes a step further and makes all instances of
struct mtd_oob_ops in the tree zero-initialized, in hope of preventing
future problems, e.g. if struct mtd_req_stats gets extended with write
statistics at some point.

Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-3-kernel@kempniu.pl
---
drivers/mtd/inftlcore.c | 6 +++---
drivers/mtd/mtdswap.c | 6 +++---
drivers/mtd/nand/onenand/onenand_base.c | 4 ++--
drivers/mtd/nand/onenand/onenand_bbt.c | 2 +-
drivers/mtd/nand/raw/nand_bbt.c | 8 ++++----
drivers/mtd/nand/raw/sm_common.c | 2 +-
drivers/mtd/nftlcore.c | 6 +++---
drivers/mtd/sm_ftl.c | 4 ++--
drivers/mtd/ssfdc.c | 2 +-
drivers/mtd/tests/nandbiterrs.c | 2 +-
drivers/mtd/tests/oobtest.c | 8 ++++----
drivers/mtd/tests/readtest.c | 2 +-
fs/jffs2/wbuf.c | 6 +++---
13 files changed, 29 insertions(+), 29 deletions(-)

--- a/drivers/mtd/inftlcore.c
+++ b/drivers/mtd/inftlcore.c
@@ -136,7 +136,7 @@ static void inftl_remove_dev(struct mtd_
int inftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len,
size_t *retlen, uint8_t *buf)
{
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int res;

ops.mode = MTD_OPS_PLACE_OOB;
@@ -156,7 +156,7 @@ int inftl_read_oob(struct mtd_info *mtd,
int inftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len,
size_t *retlen, uint8_t *buf)
{
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int res;

ops.mode = MTD_OPS_PLACE_OOB;
@@ -176,7 +176,7 @@ int inftl_write_oob(struct mtd_info *mtd
static int inftl_write(struct mtd_info *mtd, loff_t offs, size_t len,
size_t *retlen, uint8_t *buf, uint8_t *oob)
{
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int res;

ops.mode = MTD_OPS_PLACE_OOB;
--- a/drivers/mtd/mtdswap.c
+++ b/drivers/mtd/mtdswap.c
@@ -323,7 +323,7 @@ static int mtdswap_read_markers(struct m
struct mtdswap_oobdata *data, *data2;
int ret;
loff_t offset;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };

offset = mtdswap_eb_offset(d, eb);

@@ -370,7 +370,7 @@ static int mtdswap_write_marker(struct m
struct mtdswap_oobdata n;
int ret;
loff_t offset;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };

ops.ooboffs = 0;
ops.oobbuf = (uint8_t *)&n;
@@ -879,7 +879,7 @@ static unsigned int mtdswap_eblk_passes(
loff_t base, pos;
unsigned int *p1 = (unsigned int *)d->page_buf;
unsigned char *p2 = (unsigned char *)d->oob_buf;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int ret;

ops.mode = MTD_OPS_AUTO_OOB;
--- a/drivers/mtd/nand/onenand/onenand_base.c
+++ b/drivers/mtd/nand/onenand/onenand_base.c
@@ -2935,7 +2935,7 @@ static int do_otp_write(struct mtd_info
struct onenand_chip *this = mtd->priv;
unsigned char *pbuf = buf;
int ret;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };

/* Force buffer page aligned */
if (len < mtd->writesize) {
@@ -2977,7 +2977,7 @@ static int do_otp_lock(struct mtd_info *
size_t *retlen, u_char *buf)
{
struct onenand_chip *this = mtd->priv;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int ret;

if (FLEXONENAND(this)) {
--- a/drivers/mtd/nand/onenand/onenand_bbt.c
+++ b/drivers/mtd/nand/onenand/onenand_bbt.c
@@ -61,7 +61,7 @@ static int create_bbt(struct mtd_info *m
int startblock;
loff_t from;
size_t readlen, ooblen;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int rgn;

printk(KERN_INFO "Scanning device for bad blocks\n");
--- a/drivers/mtd/nand/raw/nand_bbt.c
+++ b/drivers/mtd/nand/raw/nand_bbt.c
@@ -313,7 +313,7 @@ static int scan_read_oob(struct nand_chi
size_t len)
{
struct mtd_info *mtd = nand_to_mtd(this);
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int res, ret = 0;

ops.mode = MTD_OPS_PLACE_OOB;
@@ -354,7 +354,7 @@ static int scan_write_bbt(struct nand_ch
uint8_t *buf, uint8_t *oob)
{
struct mtd_info *mtd = nand_to_mtd(this);
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };

ops.mode = MTD_OPS_PLACE_OOB;
ops.ooboffs = 0;
@@ -416,7 +416,7 @@ static int scan_block_fast(struct nand_c
{
struct mtd_info *mtd = nand_to_mtd(this);

- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int ret, page_offset;

ops.ooblen = mtd->oobsize;
@@ -756,7 +756,7 @@ static int write_bbt(struct nand_chip *t
uint8_t rcode = td->reserved_block_code;
size_t retlen, len = 0;
loff_t to;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };

ops.ooblen = mtd->oobsize;
ops.ooboffs = 0;
--- a/drivers/mtd/nand/raw/sm_common.c
+++ b/drivers/mtd/nand/raw/sm_common.c
@@ -99,7 +99,7 @@ static const struct mtd_ooblayout_ops oo
static int sm_block_markbad(struct nand_chip *chip, loff_t ofs)
{
struct mtd_info *mtd = nand_to_mtd(chip);
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
struct sm_oob oob;
int ret;

--- a/drivers/mtd/nftlcore.c
+++ b/drivers/mtd/nftlcore.c
@@ -124,7 +124,7 @@ int nftl_read_oob(struct mtd_info *mtd,
size_t *retlen, uint8_t *buf)
{
loff_t mask = mtd->writesize - 1;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int res;

ops.mode = MTD_OPS_PLACE_OOB;
@@ -145,7 +145,7 @@ int nftl_write_oob(struct mtd_info *mtd,
size_t *retlen, uint8_t *buf)
{
loff_t mask = mtd->writesize - 1;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int res;

ops.mode = MTD_OPS_PLACE_OOB;
@@ -168,7 +168,7 @@ static int nftl_write(struct mtd_info *m
size_t *retlen, uint8_t *buf, uint8_t *oob)
{
loff_t mask = mtd->writesize - 1;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int res;

ops.mode = MTD_OPS_PLACE_OOB;
--- a/drivers/mtd/sm_ftl.c
+++ b/drivers/mtd/sm_ftl.c
@@ -239,7 +239,7 @@ static int sm_read_sector(struct sm_ftl
uint8_t *buffer, struct sm_oob *oob)
{
struct mtd_info *mtd = ftl->trans->mtd;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
struct sm_oob tmp_oob;
int ret = -EIO;
int try = 0;
@@ -323,7 +323,7 @@ static int sm_write_sector(struct sm_ftl
int zone, int block, int boffset,
uint8_t *buffer, struct sm_oob *oob)
{
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
struct mtd_info *mtd = ftl->trans->mtd;
int ret;

--- a/drivers/mtd/ssfdc.c
+++ b/drivers/mtd/ssfdc.c
@@ -163,7 +163,7 @@ static int read_physical_sector(struct m
/* Read redundancy area (wrapper to MTD_READ_OOB */
static int read_raw_oob(struct mtd_info *mtd, loff_t offs, uint8_t *buf)
{
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int ret;

ops.mode = MTD_OPS_RAW;
--- a/drivers/mtd/tests/nandbiterrs.c
+++ b/drivers/mtd/tests/nandbiterrs.c
@@ -99,7 +99,7 @@ static int write_page(int log)
static int rewrite_page(int log)
{
int err = 0;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };

if (log)
pr_info("rewrite page\n");
--- a/drivers/mtd/tests/oobtest.c
+++ b/drivers/mtd/tests/oobtest.c
@@ -56,7 +56,7 @@ static void do_vary_offset(void)
static int write_eraseblock(int ebnum)
{
int i;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int err = 0;
loff_t addr = (loff_t)ebnum * mtd->erasesize;

@@ -165,7 +165,7 @@ static size_t memffshow(loff_t addr, lof
static int verify_eraseblock(int ebnum)
{
int i;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int err = 0;
loff_t addr = (loff_t)ebnum * mtd->erasesize;
size_t bitflips;
@@ -260,7 +260,7 @@ static int verify_eraseblock(int ebnum)

static int verify_eraseblock_in_one_go(int ebnum)
{
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int err = 0;
loff_t addr = (loff_t)ebnum * mtd->erasesize;
size_t len = mtd->oobavail * pgcnt;
@@ -338,7 +338,7 @@ static int __init mtd_oobtest_init(void)
int err = 0;
unsigned int i;
uint64_t tmp;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
loff_t addr = 0, addr0;

printk(KERN_INFO "\n");
--- a/drivers/mtd/tests/readtest.c
+++ b/drivers/mtd/tests/readtest.c
@@ -47,7 +47,7 @@ static int read_eraseblock_by_page(int e
err = ret;
}
if (mtd->oobsize) {
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };

ops.mode = MTD_OPS_PLACE_OOB;
ops.len = 0;
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1035,7 +1035,7 @@ int jffs2_check_oob_empty(struct jffs2_s
{
int i, ret;
int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };

ops.mode = MTD_OPS_AUTO_OOB;
ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
@@ -1076,7 +1076,7 @@ int jffs2_check_oob_empty(struct jffs2_s
int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
struct jffs2_eraseblock *jeb)
{
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);

ops.mode = MTD_OPS_AUTO_OOB;
@@ -1101,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct
struct jffs2_eraseblock *jeb)
{
int ret;
- struct mtd_oob_ops ops;
+ struct mtd_oob_ops ops = { };
int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);

ops.mode = MTD_OPS_AUTO_OOB;
@ -0,0 +1,172 @@
From 2ed18d818d1f7492172f8dd5904344c7d367e8ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= <kernel@kempniu.pl>
Date: Wed, 29 Jun 2022 14:57:36 +0200
Subject: [PATCH 3/4] mtd: add ECC error accounting for each read request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend struct mtd_req_stats with two new fields holding the number of
corrected bitflips and uncorrectable errors detected during a read
operation. This is a prerequisite for ultimately passing those counters
to user space, where they can be useful to applications for making
better-informed choices about moving data around.

Unlike 'max_bitflips' (which is set - in a common code path - to the
return value of a function called while the MTD device's mutex is held),
these counters have to be maintained in each MTD driver which defines
the '_read_oob' callback because the statistics need to be calculated
while the MTD device's mutex is held.

Suggested-by: Boris Brezillon <boris.brezillon@collabora.com>
Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-4-kernel@kempniu.pl
---
drivers/mtd/devices/docg3.c | 8 ++++++++
drivers/mtd/nand/onenand/onenand_base.c | 12 ++++++++++++
drivers/mtd/nand/raw/nand_base.c | 10 ++++++++++
drivers/mtd/nand/spi/core.c | 10 ++++++++++
include/linux/mtd/mtd.h | 2 ++
5 files changed, 42 insertions(+)

--- a/drivers/mtd/devices/docg3.c
+++ b/drivers/mtd/devices/docg3.c
@@ -871,6 +871,7 @@ static int doc_read_oob(struct mtd_info
u8 *buf = ops->datbuf;
size_t len, ooblen, nbdata, nboob;
u8 hwecc[DOC_ECC_BCH_SIZE], eccconf1;
+ struct mtd_ecc_stats old_stats;
int max_bitflips = 0;

if (buf)
@@ -895,6 +896,7 @@ static int doc_read_oob(struct mtd_info
ret = 0;
skip = from % DOC_LAYOUT_PAGE_SIZE;
mutex_lock(&docg3->cascade->lock);
+ old_stats = mtd->ecc_stats;
while (ret >= 0 && (len > 0 || ooblen > 0)) {
calc_block_sector(from - skip, &block0, &block1, &page, &ofs,
docg3->reliable);
@@ -966,6 +968,12 @@ static int doc_read_oob(struct mtd_info
}

out:
+ if (ops->stats) {
+ ops->stats->uncorrectable_errors +=
+ mtd->ecc_stats.failed - old_stats.failed;
+ ops->stats->corrected_bitflips +=
+ mtd->ecc_stats.corrected - old_stats.corrected;
+ }
mutex_unlock(&docg3->cascade->lock);
return ret;
err_in_read:
--- a/drivers/mtd/nand/onenand/onenand_base.c
+++ b/drivers/mtd/nand/onenand/onenand_base.c
@@ -1440,6 +1440,7 @@ static int onenand_read_oob(struct mtd_i
struct mtd_oob_ops *ops)
{
struct onenand_chip *this = mtd->priv;
+ struct mtd_ecc_stats old_stats;
int ret;

switch (ops->mode) {
@@ -1453,12 +1454,23 @@ static int onenand_read_oob(struct mtd_i
}

onenand_get_device(mtd, FL_READING);
+
+ old_stats = mtd->ecc_stats;
+
if (ops->datbuf)
ret = ONENAND_IS_4KB_PAGE(this) ?
onenand_mlc_read_ops_nolock(mtd, from, ops) :
onenand_read_ops_nolock(mtd, from, ops);
else
ret = onenand_read_oob_nolock(mtd, from, ops);
+
+ if (ops->stats) {
+ ops->stats->uncorrectable_errors +=
+ mtd->ecc_stats.failed - old_stats.failed;
+ ops->stats->corrected_bitflips +=
+ mtd->ecc_stats.corrected - old_stats.corrected;
+ }
+
onenand_release_device(mtd);

return ret;
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -3815,6 +3815,7 @@ static int nand_read_oob(struct mtd_info
struct mtd_oob_ops *ops)
{
struct nand_chip *chip = mtd_to_nand(mtd);
+ struct mtd_ecc_stats old_stats;
int ret;

ops->retlen = 0;
@@ -3826,11 +3827,20 @@ static int nand_read_oob(struct mtd_info

nand_get_device(chip);

+ old_stats = mtd->ecc_stats;
+
if (!ops->datbuf)
ret = nand_do_read_oob(chip, from, ops);
else
ret = nand_do_read_ops(chip, from, ops);

+ if (ops->stats) {
+ ops->stats->uncorrectable_errors +=
+ mtd->ecc_stats.failed - old_stats.failed;
+ ops->stats->corrected_bitflips +=
+ mtd->ecc_stats.corrected - old_stats.corrected;
+ }
+
nand_release_device(chip);
return ret;
}
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -629,6 +629,7 @@ static int spinand_mtd_read(struct mtd_i
{
struct spinand_device *spinand = mtd_to_spinand(mtd);
struct nand_device *nand = mtd_to_nanddev(mtd);
+ struct mtd_ecc_stats old_stats;
unsigned int max_bitflips = 0;
struct nand_io_iter iter;
bool disable_ecc = false;
@@ -640,6 +641,8 @@ static int spinand_mtd_read(struct mtd_i

mutex_lock(&spinand->lock);

+ old_stats = mtd->ecc_stats;
+
nanddev_io_for_each_page(nand, NAND_PAGE_READ, from, ops, &iter) {
if (disable_ecc)
iter.req.mode = MTD_OPS_RAW;
@@ -662,6 +665,13 @@ static int spinand_mtd_read(struct mtd_i
ops->oobretlen += iter.req.ooblen;
}

+ if (ops->stats) {
+ ops->stats->uncorrectable_errors +=
+ mtd->ecc_stats.failed - old_stats.failed;
+ ops->stats->corrected_bitflips +=
+ mtd->ecc_stats.corrected - old_stats.corrected;
+ }
+
mutex_unlock(&spinand->lock);

if (ecc_failed && !ret)
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -41,6 +41,8 @@ struct mtd_erase_region_info {
};

struct mtd_req_stats {
+ unsigned int uncorrectable_errors;
+ unsigned int corrected_bitflips;
unsigned int max_bitflips;
};

@ -0,0 +1,321 @@
From 2c9745d36e04ac27161acd78514f647b9b587ad4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20K=C4=99pie=C5=84?= <kernel@kempniu.pl>
Date: Wed, 29 Jun 2022 14:57:37 +0200
Subject: [PATCH 4/4] mtdchar: add MEMREAD ioctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User-space applications making use of MTD devices via /dev/mtd*
character devices currently have limited capabilities for reading data:

- only deprecated methods of accessing OOB layout information exist,

- there is no way to explicitly specify MTD operation mode to use; it
is auto-selected based on the MTD file mode (MTD_FILE_MODE_*) set
for the character device; in particular, this prevents using
MTD_OPS_AUTO_OOB for reads,

- all existing user-space interfaces which cause mtd_read() or
mtd_read_oob() to be called (via mtdchar_read() and
mtdchar_read_oob(), respectively) return success even when those
functions return -EUCLEAN or -EBADMSG; this renders user-space
applications using these interfaces unaware of any corrected
bitflips or uncorrectable ECC errors detected during reads.

Note that the existing MEMWRITE ioctl allows the MTD operation mode to
be explicitly set, allowing user-space applications to write page data
and OOB data without requiring them to know anything about the OOB
layout of the MTD device they are writing to (MTD_OPS_AUTO_OOB). Also,
the MEMWRITE ioctl does not mangle the return value of mtd_write_oob().

Add a new ioctl, MEMREAD, which addresses the above issues. It is
intended to be a read-side counterpart of the existing MEMWRITE ioctl.
Similarly to the latter, the read operation is performed in a loop which
processes at most mtd->erasesize bytes in each iteration. This is done
to prevent unbounded memory allocations caused by calling kmalloc() with
the 'size' argument taken directly from the struct mtd_read_req provided
by user space. However, the new ioctl is implemented so that the values
it returns match those that would have been returned if just a single
mtd_read_oob() call was issued to handle the entire read operation in
one go.

Note that while just returning -EUCLEAN or -EBADMSG to user space would
already be a valid and useful indication of the ECC algorithm detecting
errors during a read operation, that signal would not be granular enough
to cover all use cases. For example, knowing the maximum number of
bitflips detected in a single ECC step during a read operation performed
on a given page may be useful when dealing with an MTD partition whose
ECC layout varies across pages (e.g. a partition consisting of a
bootloader area using a "custom" ECC layout followed by data pages using
a "standard" ECC layout). To address that, include ECC statistics in
the structure returned to user space by the new MEMREAD ioctl.

Link: https://www.infradead.org/pipermail/linux-mtd/2016-April/067085.html

Suggested-by: Boris Brezillon <boris.brezillon@collabora.com>
Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Acked-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-5-kernel@kempniu.pl
---
drivers/mtd/mtdchar.c | 139 +++++++++++++++++++++++++++++++
include/uapi/mtd/mtd-abi.h | 64 +++++++++++++++--
2 files changed, 198 insertions(+), 5 deletions(-)

--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -621,6 +621,137 @@ static int mtdchar_write_ioctl(struct mt
return ret;
}

+static int mtdchar_read_ioctl(struct mtd_info *mtd,
+ struct mtd_read_req __user *argp)
+{
+ struct mtd_info *master = mtd_get_master(mtd);
+ struct mtd_read_req req;
+ void __user *usr_data, *usr_oob;
+ uint8_t *datbuf = NULL, *oobbuf = NULL;
+ size_t datbuf_len, oobbuf_len;
+ size_t orig_len, orig_ooblen;
+ int ret = 0;
+
+ if (copy_from_user(&req, argp, sizeof(req)))
+ return -EFAULT;
+
+ orig_len = req.len;
+ orig_ooblen = req.ooblen;
+
+ usr_data = (void __user *)(uintptr_t)req.usr_data;
+ usr_oob = (void __user *)(uintptr_t)req.usr_oob;
+
+ if (!master->_read_oob)
+ return -EOPNOTSUPP;
+
+ if (!usr_data)
+ req.len = 0;
+
+ if (!usr_oob)
+ req.ooblen = 0;
+
+ req.ecc_stats.uncorrectable_errors = 0;
+ req.ecc_stats.corrected_bitflips = 0;
+ req.ecc_stats.max_bitflips = 0;
+
+ req.len &= 0xffffffff;
+ req.ooblen &= 0xffffffff;
+
+ if (req.start + req.len > mtd->size) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ datbuf_len = min_t(size_t, req.len, mtd->erasesize);
+ if (datbuf_len > 0) {
+ datbuf = kvmalloc(datbuf_len, GFP_KERNEL);
+ if (!datbuf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ oobbuf_len = min_t(size_t, req.ooblen, mtd->erasesize);
+ if (oobbuf_len > 0) {
+ oobbuf = kvmalloc(oobbuf_len, GFP_KERNEL);
+ if (!oobbuf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ while (req.len > 0 || (!usr_data && req.ooblen > 0)) {
+ struct mtd_req_stats stats;
+ struct mtd_oob_ops ops = {
+ .mode = req.mode,
+ .len = min_t(size_t, req.len, datbuf_len),
+ .ooblen = min_t(size_t, req.ooblen, oobbuf_len),
+ .datbuf = datbuf,
+ .oobbuf = oobbuf,
+ .stats = &stats,
+ };
+
+ /*
+ * Shorten non-page-aligned, eraseblock-sized reads so that the
+ * read ends on an eraseblock boundary. This is necessary in
+ * order to prevent OOB data for some pages from being
+ * duplicated in the output of non-page-aligned reads requiring
+ * multiple mtd_read_oob() calls to be completed.
+ */
+ if (ops.len == mtd->erasesize)
+ ops.len -= mtd_mod_by_ws(req.start + ops.len, mtd);
+
+ ret = mtd_read_oob(mtd, (loff_t)req.start, &ops);
+
+ req.ecc_stats.uncorrectable_errors +=
+ stats.uncorrectable_errors;
+ req.ecc_stats.corrected_bitflips += stats.corrected_bitflips;
+ req.ecc_stats.max_bitflips =
+ max(req.ecc_stats.max_bitflips, stats.max_bitflips);
+
+ if (ret && !mtd_is_bitflip_or_eccerr(ret))
+ break;
+
+ if (copy_to_user(usr_data, ops.datbuf, ops.retlen) ||
+ copy_to_user(usr_oob, ops.oobbuf, ops.oobretlen)) {
+ ret = -EFAULT;
+ break;
+ }
+
+ req.start += ops.retlen;
+ req.len -= ops.retlen;
+ usr_data += ops.retlen;
+
+ req.ooblen -= ops.oobretlen;
+ usr_oob += ops.oobretlen;
+ }
+
+ /*
+ * As multiple iterations of the above loop (and therefore multiple
+ * mtd_read_oob() calls) may be necessary to complete the read request,
+ * adjust the final return code to ensure it accounts for all detected
+ * ECC errors.
+ */
+ if (!ret || mtd_is_bitflip(ret)) {
+ if (req.ecc_stats.uncorrectable_errors > 0)
+ ret = -EBADMSG;
+ else if (req.ecc_stats.corrected_bitflips > 0)
+ ret = -EUCLEAN;
+ }
+
+out:
+ req.len = orig_len - req.len;
+ req.ooblen = orig_ooblen - req.ooblen;
+
+ if (copy_to_user(argp, &req, sizeof(req)))
+ ret = -EFAULT;
+
+ kvfree(datbuf);
+ kvfree(oobbuf);
+
+ return ret;
+}
+
static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
{
struct mtd_file_info *mfi = file->private_data;
@@ -643,6 +774,7 @@ static int mtdchar_ioctl(struct file *fi
case MEMGETINFO:
case MEMREADOOB:
case MEMREADOOB64:
+ case MEMREAD:
case MEMISLOCKED:
case MEMGETOOBSEL:
case MEMGETBADBLOCK:
@@ -817,6 +949,13 @@ static int mtdchar_ioctl(struct file *fi
break;
}

+ case MEMREAD:
+ {
+ ret = mtdchar_read_ioctl(mtd,
+ (struct mtd_read_req __user *)arg);
+ break;
+ }
+
case MEMLOCK:
{
struct erase_info_user einfo;
--- a/include/uapi/mtd/mtd-abi.h
+++ b/include/uapi/mtd/mtd-abi.h
@@ -55,9 +55,9 @@ struct mtd_oob_buf64 {
* @MTD_OPS_RAW: data are transferred as-is, with no error correction;
* this mode implies %MTD_OPS_PLACE_OOB
*
- * These modes can be passed to ioctl(MEMWRITE) and are also used internally.
- * See notes on "MTD file modes" for discussion on %MTD_OPS_RAW vs.
- * %MTD_FILE_MODE_RAW.
+ * These modes can be passed to ioctl(MEMWRITE) and ioctl(MEMREAD); they are
+ * also used internally. See notes on "MTD file modes" for discussion on
+ * %MTD_OPS_RAW vs. %MTD_FILE_MODE_RAW.
*/
enum {
MTD_OPS_PLACE_OOB = 0,
@@ -91,6 +91,53 @@ struct mtd_write_req {
__u8 padding[7];
};

+/**
+ * struct mtd_read_req_ecc_stats - ECC statistics for a read operation
+ *
+ * @uncorrectable_errors: the number of uncorrectable errors that happened
+ * during the read operation
+ * @corrected_bitflips: the number of bitflips corrected during the read
+ * operation
+ * @max_bitflips: the maximum number of bitflips detected in any single ECC
+ * step for the data read during the operation; this information
+ * can be used to decide whether the data stored in a specific
+ * region of the MTD device should be moved somewhere else to
+ * avoid data loss.
+ */
+struct mtd_read_req_ecc_stats {
+ __u32 uncorrectable_errors;
+ __u32 corrected_bitflips;
+ __u32 max_bitflips;
+};
+
+/**
+ * struct mtd_read_req - data structure for requesting a read operation
+ *
+ * @start: start address
+ * @len: length of data buffer (only lower 32 bits are used)
+ * @ooblen: length of OOB buffer (only lower 32 bits are used)
+ * @usr_data: user-provided data buffer
+ * @usr_oob: user-provided OOB buffer
+ * @mode: MTD mode (see "MTD operation modes")
+ * @padding: reserved, must be set to 0
+ * @ecc_stats: ECC statistics for the read operation
+ *
+ * This structure supports ioctl(MEMREAD) operations, allowing data and/or OOB
+ * reads in various modes. To read from OOB-only, set @usr_data == NULL, and to
+ * read data-only, set @usr_oob == NULL. However, setting both @usr_data and
+ * @usr_oob to NULL is not allowed.
+ */
+struct mtd_read_req {
+ __u64 start;
+ __u64 len;
+ __u64 ooblen;
+ __u64 usr_data;
+ __u64 usr_oob;
+ __u8 mode;
+ __u8 padding[7];
+ struct mtd_read_req_ecc_stats ecc_stats;
+};
+
#define MTD_ABSENT 0
#define MTD_RAM 1
#define MTD_ROM 2
@@ -207,6 +254,12 @@ struct otp_info {
#define MEMWRITE _IOWR('M', 24, struct mtd_write_req)
/* Erase a given range of user data (must be in mode %MTD_FILE_MODE_OTP_USER) */
#define OTPERASE _IOW('M', 25, struct otp_info)
+/*
+ * Most generic read interface; can read in-band and/or out-of-band in various
+ * modes (see "struct mtd_read_req"). This ioctl is not supported for flashes
+ * without OOB, e.g., NOR flash.
+ */
+#define MEMREAD _IOWR('M', 26, struct mtd_read_req)

/*
* Obsolete legacy interface. Keep it in order not to break userspace
@@ -270,8 +323,9 @@ struct mtd_ecc_stats {
* Note: %MTD_FILE_MODE_RAW provides the same functionality as %MTD_OPS_RAW -
* raw access to the flash, without error correction or autoplacement schemes.
* Wherever possible, the MTD_OPS_* mode will override the MTD_FILE_MODE_* mode
- * (e.g., when using ioctl(MEMWRITE)), but in some cases, the MTD_FILE_MODE is
- * used out of necessity (e.g., `write()', ioctl(MEMWRITEOOB64)).
+ * (e.g., when using ioctl(MEMWRITE) or ioctl(MEMREAD)), but in some cases, the
+ * MTD_FILE_MODE is used out of necessity (e.g., `write()',
+ * ioctl(MEMWRITEOOB64)).
*/
enum mtd_file_modes {
MTD_FILE_MODE_NORMAL = MTD_OTP_OFF,
@ -0,0 +1,35 @@
From ebed787a0becb9354f0a23620a5130cccd6c730c Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Thu, 19 Jan 2023 03:45:43 +0000
Subject: [PATCH] mtd: spinand: macronix: use scratch buffer for DMA operation

The mx35lf1ge4ab_get_eccsr() function uses an SPI DMA operation to
read the eccsr, hence the buffer should not be on the stack. Since commit
380583227c0c7f ("spi: spi-mem: Add extra sanity checks on the op param")
the kernel emits a warning and blocks such operations.

Use the scratch buffer to get eccsr instead of trying to directly read
into a stack-allocated variable.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/Y8i85zM0u4XdM46z@makrotopia.org
---
drivers/mtd/nand/spi/macronix.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

--- a/drivers/mtd/nand/spi/macronix.c
+++ b/drivers/mtd/nand/spi/macronix.c
@@ -83,9 +83,10 @@ static int mx35lf1ge4ab_ecc_get_status(s
* in order to avoid forcing the wear-leveling layer to move
* data around if it's not necessary.
*/
- if (mx35lf1ge4ab_get_eccsr(spinand, &eccsr))
+ if (mx35lf1ge4ab_get_eccsr(spinand, spinand->scratchbuf))
return nanddev_get_ecc_conf(nand)->strength;

+ eccsr = *spinand->scratchbuf;
if (WARN_ON(eccsr > nanddev_get_ecc_conf(nand)->strength ||
!eccsr))
return nanddev_get_ecc_conf(nand)->strength;
@ -0,0 +1,47 @@
From 281f7a6c1a33fffcde32001bacbb4f672140fbf9 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Wed, 8 Mar 2023 09:20:21 +0100
Subject: [PATCH] mtd: core: prepare mtd_otp_nvmem_add() to handle
 -EPROBE_DEFER

NVMEM will soon get the ability to use nvmem layouts, and these might
not be ready when nvmem_register() is called, so it might
return -EPROBE_DEFER. Don't print the error message in this case.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20230308082021.870459-4-michael@walle.cc
---
drivers/mtd/mtdcore.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -960,8 +960,8 @@ static int mtd_otp_nvmem_add(struct mtd_
nvmem = mtd_otp_nvmem_register(mtd, "user-otp", size,
mtd_nvmem_user_otp_reg_read);
if (IS_ERR(nvmem)) {
- dev_err(dev, "Failed to register OTP NVMEM device\n");
- return PTR_ERR(nvmem);
+ err = PTR_ERR(nvmem);
+ goto err;
}
mtd->otp_user_nvmem = nvmem;
}
@@ -978,7 +978,6 @@ static int mtd_otp_nvmem_add(struct mtd_
nvmem = mtd_otp_nvmem_register(mtd, "factory-otp", size,
mtd_nvmem_fact_otp_reg_read);
if (IS_ERR(nvmem)) {
- dev_err(dev, "Failed to register OTP NVMEM device\n");
err = PTR_ERR(nvmem);
goto err;
}
@@ -991,7 +990,7 @@ static int mtd_otp_nvmem_add(struct mtd_
err:
if (mtd->otp_user_nvmem)
nvmem_unregister(mtd->otp_user_nvmem);
- return err;
+ return dev_err_probe(dev, err, "Failed to register OTP NVMEM device\n");
}

/**
@@ -0,0 +1,165 @@
From 8610037e8106b48c79cfe0afb92b2b2466e51c3d Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Tue, 1 Mar 2022 23:55:47 -0800
Subject: [PATCH] page_pool: Add allocation stats

Add per-pool statistics counters for the allocation path of a page pool.
These stats are incremented in softirq context, so no locking or per-cpu
variables are needed.

This code is disabled by default and a kernel config option is provided for
users who wish to enable them.

The statistics added are:
	- fast: successful fast path allocations
	- slow: slow path order-0 allocations
	- slow_high_order: slow path high order allocations
	- empty: ptr ring is empty, so a slow path allocation was forced.
	- refill: an allocation which triggered a refill of the cache
	- waive: pages obtained from the ptr ring that cannot be added to
	  the cache due to a NUMA mismatch.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 18 ++++++++++++++++
 net/Kconfig | 13 +++++++++++++
 net/core/page_pool.c | 24 ++++++++++++++++++++----
 3 files changed, 51 insertions(+), 4 deletions(-)

--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -82,6 +82,19 @@ struct page_pool_params {
 	unsigned int	offset;  /* DMA addr offset */
 };

+#ifdef CONFIG_PAGE_POOL_STATS
+struct page_pool_alloc_stats {
+	u64 fast; /* fast path allocations */
+	u64 slow; /* slow-path order 0 allocations */
+	u64 slow_high_order; /* slow-path high order allocations */
+	u64 empty; /* failed refills due to empty ptr ring, forcing
+		    * slow path allocation
+		    */
+	u64 refill; /* allocations via successful refill */
+	u64 waive;  /* failed refills due to numa zone mismatch */
+};
+#endif
+
 struct page_pool {
 	struct page_pool_params p;

@@ -132,6 +145,11 @@ struct page_pool {
 	refcount_t user_cnt;

 	u64 destroy_cnt;
+
+#ifdef CONFIG_PAGE_POOL_STATS
+	/* these stats are incremented while in softirq context */
+	struct page_pool_alloc_stats alloc_stats;
+#endif
 };

 struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -434,6 +434,19 @@ config NET_DEVLINK
 config PAGE_POOL
 	bool

+config PAGE_POOL_STATS
+	default n
+	bool "Page pool stats"
+	depends on PAGE_POOL
+	help
+	  Enable page pool statistics to track page allocation and recycling
+	  in page pools. This option incurs additional CPU cost in allocation
+	  and recycle paths and additional memory cost to store the statistics.
+	  These statistics are only available if this option is enabled and if
+	  the driver using the page pool supports exporting this data.
+
+	  If unsure, say N.
+
 config FAILOVER
 	tristate "Generic failover module"
 	help
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -26,6 +26,13 @@

 #define BIAS_MAX	LONG_MAX

+#ifdef CONFIG_PAGE_POOL_STATS
+/* alloc_stat_inc is intended to be used in softirq context */
+#define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
+#else
+#define alloc_stat_inc(pool, __stat)
+#endif
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -117,8 +124,10 @@ static struct page *page_pool_refill_all
 	int pref_nid; /* preferred NUMA node */

 	/* Quicker fallback, avoid locks when ring is empty */
-	if (__ptr_ring_empty(r))
+	if (__ptr_ring_empty(r)) {
+		alloc_stat_inc(pool, empty);
 		return NULL;
+	}

 	/* Softirq guarantee CPU and thus NUMA node is stable. This,
	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
@@ -148,14 +157,17 @@ static struct page *page_pool_refill_all
			 * This limit stress on page buddy alloactor.
			 */
 			page_pool_return_page(pool, page);
+			alloc_stat_inc(pool, waive);
 			page = NULL;
 			break;
 		}
 	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

 	/* Return last page */
-	if (likely(pool->alloc.count > 0))
+	if (likely(pool->alloc.count > 0)) {
 		page = pool->alloc.cache[--pool->alloc.count];
+		alloc_stat_inc(pool, refill);
+	}

 	spin_unlock(&r->consumer_lock);
 	return page;
@@ -170,6 +182,7 @@ static struct page *__page_pool_get_cach
 	if (likely(pool->alloc.count)) {
 		/* Fast-path */
 		page = pool->alloc.cache[--pool->alloc.count];
+		alloc_stat_inc(pool, fast);
 	} else {
 		page = page_pool_refill_alloc_cache(pool);
 	}
@@ -241,6 +254,7 @@ static struct page *__page_pool_alloc_pa
 		return NULL;
 	}

+	alloc_stat_inc(pool, slow_high_order);
 	page_pool_set_pp_info(pool, page);

 	/* Track how many pages are held 'in-flight' */
@@ -295,10 +309,12 @@ static struct page *__page_pool_alloc_pa
 	}

 	/* Return last page */
-	if (likely(pool->alloc.count > 0))
+	if (likely(pool->alloc.count > 0)) {
 		page = pool->alloc.cache[--pool->alloc.count];
-	else
+		alloc_stat_inc(pool, slow);
+	} else {
 		page = NULL;
+	}

 	/* When page just alloc'ed is should/must have refcnt 1. */
 	return page;
@@ -0,0 +1,140 @@
From ad6fa1e1ab1b8164f1ba296b1b4dc556a483bcad Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Tue, 1 Mar 2022 23:55:48 -0800
Subject: [PATCH 2/3] page_pool: Add recycle stats

Add per-cpu stats tracking page pool recycling events:
	- cached: recycling placed page in the page pool cache
	- cache_full: page pool cache was full
	- ring: page placed into the ptr ring
	- ring_full: page released from page pool because the ptr ring was full
	- released_refcnt: page released (and not recycled) because refcnt > 1

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 16 ++++++++++++++++
 net/core/page_pool.c | 30 ++++++++++++++++++++++++++++--
 2 files changed, 44 insertions(+), 2 deletions(-)

--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -93,6 +93,18 @@ struct page_pool_alloc_stats {
 	u64 refill; /* allocations via successful refill */
 	u64 waive;  /* failed refills due to numa zone mismatch */
 };
+
+struct page_pool_recycle_stats {
+	u64 cached;	/* recycling placed page in the cache. */
+	u64 cache_full; /* cache was full */
+	u64 ring;	/* recycling placed page back into ptr ring */
+	u64 ring_full;	/* page was released from page-pool because
+			 * PTR ring was full.
+			 */
+	u64 released_refcnt; /* page released because of elevated
+			      * refcnt
+			      */
+};
 #endif

 struct page_pool {
@@ -136,6 +148,10 @@ struct page_pool {
	 */
 	struct ptr_ring ring;

+#ifdef CONFIG_PAGE_POOL_STATS
+	/* recycle stats are per-cpu to avoid locking */
+	struct page_pool_recycle_stats __percpu *recycle_stats;
+#endif
 	atomic_t pages_state_release_cnt;

 	/* A page_pool is strictly tied to a single RX-queue being
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -29,8 +29,15 @@
 #ifdef CONFIG_PAGE_POOL_STATS
 /* alloc_stat_inc is intended to be used in softirq context */
 #define alloc_stat_inc(pool, __stat)	(pool->alloc_stats.__stat++)
+/* recycle_stat_inc is safe to use when preemption is possible. */
+#define recycle_stat_inc(pool, __stat)						\
+	do {									\
+		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+		this_cpu_inc(s->__stat);					\
+	} while (0)
 #else
 #define alloc_stat_inc(pool, __stat)
+#define recycle_stat_inc(pool, __stat)
 #endif

 static int page_pool_init(struct page_pool *pool,
@@ -80,6 +87,12 @@ static int page_pool_init(struct page_po
	    pool->p.flags & PP_FLAG_PAGE_FRAG)
 		return -EINVAL;

+#ifdef CONFIG_PAGE_POOL_STATS
+	pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
+	if (!pool->recycle_stats)
+		return -ENOMEM;
+#endif
+
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;

@@ -412,7 +425,12 @@ static bool page_pool_recycle_in_ring(st
 	else
 		ret = ptr_ring_produce_bh(&pool->ring, page);

-	return (ret == 0) ? true : false;
+	if (!ret) {
+		recycle_stat_inc(pool, ring);
+		return true;
+	}
+
+	return false;
 }

 /* Only allow direct recycling in special circumstances, into the
@@ -423,11 +441,14 @@ static bool page_pool_recycle_in_ring(st
 static bool page_pool_recycle_in_cache(struct page *page,
 				       struct page_pool *pool)
 {
-	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
+		recycle_stat_inc(pool, cache_full);
 		return false;
+	}

 	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
 	pool->alloc.cache[pool->alloc.count++] = page;
+	recycle_stat_inc(pool, cached);
 	return true;
 }

@@ -482,6 +503,7 @@ __page_pool_put_page(struct page_pool *p
	 * doing refcnt based recycle tricks, meaning another process
	 * will be invoking put_page.
	 */
+	recycle_stat_inc(pool, released_refcnt);
 	/* Do not replace this with page_pool_return_page() */
 	page_pool_release_page(pool, page);
 	put_page(page);
@@ -495,6 +517,7 @@ void page_pool_put_page(struct page_pool
 	page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
 	if (page && !page_pool_recycle_in_ring(pool, page)) {
 		/* Cache full, fallback to free pages */
+		recycle_stat_inc(pool, ring_full);
 		page_pool_return_page(pool, page);
 	}
 }
@@ -641,6 +664,9 @@ static void page_pool_free(struct page_p
 	if (pool->p.flags & PP_FLAG_DMA_MAP)
 		put_device(pool->p.dev);

+#ifdef CONFIG_PAGE_POOL_STATS
+	free_percpu(pool->recycle_stats);
+#endif
 	kfree(pool);
 }

@@ -0,0 +1,77 @@
From 6b95e3388b1ea0ca63500c5a6e39162dbf828433 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Tue, 1 Mar 2022 23:55:49 -0800
Subject: [PATCH 3/3] page_pool: Add function to batch and return stats

Adds a function page_pool_get_stats which can be used by drivers to obtain
stats for a specified page_pool.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 17 +++++++++++++++++
 net/core/page_pool.c | 25 +++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -105,6 +105,23 @@ struct page_pool_recycle_stats {
			      * refcnt
			      */
 };
+
+/* This struct wraps the above stats structs so users of the
+ * page_pool_get_stats API can pass a single argument when requesting the
+ * stats for the page pool.
+ */
+struct page_pool_stats {
+	struct page_pool_alloc_stats alloc_stats;
+	struct page_pool_recycle_stats recycle_stats;
+};
+
+/*
+ * Drivers that wish to harvest page pool stats and report them to users
+ * (perhaps via ethtool, debugfs, or another mechanism) can allocate a
+ * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool.
+ */
+bool page_pool_get_stats(struct page_pool *pool,
+			 struct page_pool_stats *stats);
 #endif

 struct page_pool {
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -35,6 +35,31 @@
 		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
 		this_cpu_inc(s->__stat);					\
 	} while (0)
+
+bool page_pool_get_stats(struct page_pool *pool,
+			 struct page_pool_stats *stats)
+{
+	int cpu = 0;
+
+	if (!stats)
+		return false;
+
+	memcpy(&stats->alloc_stats, &pool->alloc_stats, sizeof(pool->alloc_stats));
+
+	for_each_possible_cpu(cpu) {
+		const struct page_pool_recycle_stats *pcpu =
+			per_cpu_ptr(pool->recycle_stats, cpu);
+
+		stats->recycle_stats.cached += pcpu->cached;
+		stats->recycle_stats.cache_full += pcpu->cache_full;
+		stats->recycle_stats.ring += pcpu->ring;
+		stats->recycle_stats.ring_full += pcpu->ring_full;
+		stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(page_pool_get_stats);
 #else
 #define alloc_stat_inc(pool, __stat)
 #define recycle_stat_inc(pool, __stat)
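
Taken together with the two preceding patches, a driver can now snapshot a pool's counters in one call. The following is a minimal sketch, assuming CONFIG_PAGE_POOL_STATS=y; the my_dump_pool_stats() name and the pr_info() reporting are illustrative, not part of the patch.

#include <linux/printk.h>
#include <net/page_pool.h>

static void my_dump_pool_stats(struct page_pool *pool)
{
	/* Zero-initialize the destination: the per-cpu recycle counters
	 * are summed into it across all possible CPUs. */
	struct page_pool_stats stats = {};

	if (!page_pool_get_stats(pool, &stats))
		return;

	pr_info("alloc: fast=%llu slow=%llu empty=%llu refill=%llu\n",
		stats.alloc_stats.fast, stats.alloc_stats.slow,
		stats.alloc_stats.empty, stats.alloc_stats.refill);
	pr_info("recycle: cached=%llu ring=%llu ring_full=%llu\n",
		stats.recycle_stats.cached, stats.recycle_stats.ring,
		stats.recycle_stats.ring_full);
}
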
@@ -0,0 +1,55 @@
From 590032a4d2133ecc10d3078a8db1d85a4842f12c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 11 Apr 2022 16:05:26 +0200
Subject: [PATCH] page_pool: Add recycle stats to page_pool_put_page_bulk

Add missing recycle stats to page_pool_put_page_bulk routine.

Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Link: https://lore.kernel.org/r/3712178b51c007cfaed910ea80e68f00c916b1fa.1649685634.git.lorenzo@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/core/page_pool.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -36,6 +36,12 @@
 		this_cpu_inc(s->__stat);					\
 	} while (0)

+#define recycle_stat_add(pool, __stat, val)					\
+	do {									\
+		struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+		this_cpu_add(s->__stat, val);					\
+	} while (0)
+
 bool page_pool_get_stats(struct page_pool *pool,
 			 struct page_pool_stats *stats)
 {
@@ -63,6 +69,7 @@ EXPORT_SYMBOL(page_pool_get_stats);
 #else
 #define alloc_stat_inc(pool, __stat)
 #define recycle_stat_inc(pool, __stat)
+#define recycle_stat_add(pool, __stat, val)
 #endif

 static int page_pool_init(struct page_pool *pool,
@@ -569,9 +576,13 @@ void page_pool_put_page_bulk(struct page
 	/* Bulk producer into ptr_ring page_pool cache */
 	page_pool_ring_lock(pool);
 	for (i = 0; i < bulk_len; i++) {
-		if (__ptr_ring_produce(&pool->ring, data[i]))
-			break; /* ring full */
+		if (__ptr_ring_produce(&pool->ring, data[i])) {
+			/* ring full */
+			recycle_stat_inc(pool, ring_full);
+			break;
+		}
 	}
+	recycle_stat_add(pool, ring, i);
 	page_pool_ring_unlock(pool);

 	/* Hopefully all pages was return into ptr_ring */
@@ -0,0 +1,147 @@
From f3c5264f452a5b0ac1de1f2f657efbabdea3c76a Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 12 Apr 2022 18:31:58 +0200
Subject: [PATCH] net: page_pool: introduce ethtool stats

Introduce page_pool APIs to report stats through ethtool and reduce
duplicated code in each driver.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/page_pool.h | 21 ++++++++++++++
 net/core/page_pool.c | 63 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 83 insertions(+), 1 deletion(-)

--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -115,6 +115,10 @@ struct page_pool_stats {
 	struct page_pool_recycle_stats recycle_stats;
 };

+int page_pool_ethtool_stats_get_count(void);
+u8 *page_pool_ethtool_stats_get_strings(u8 *data);
+u64 *page_pool_ethtool_stats_get(u64 *data, void *stats);
+
 /*
  * Drivers that wish to harvest page pool stats and report them to users
  * (perhaps via ethtool, debugfs, or another mechanism) can allocate a
@@ -122,6 +126,23 @@ struct page_pool_stats {
  */
 bool page_pool_get_stats(struct page_pool *pool,
 			 struct page_pool_stats *stats);
+#else
+
+static inline int page_pool_ethtool_stats_get_count(void)
+{
+	return 0;
+}
+
+static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data)
+{
+	return data;
+}
+
+static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+{
+	return data;
+}
+
 #endif

 struct page_pool {
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -18,6 +18,7 @@
 #include <linux/page-flags.h>
 #include <linux/mm.h> /* for __put_page() */
 #include <linux/poison.h>
+#include <linux/ethtool.h>

 #include <trace/events/page_pool.h>

@@ -42,6 +43,20 @@
 		this_cpu_add(s->__stat, val);					\
 	} while (0)

+static const char pp_stats[][ETH_GSTRING_LEN] = {
+	"rx_pp_alloc_fast",
+	"rx_pp_alloc_slow",
+	"rx_pp_alloc_slow_ho",
+	"rx_pp_alloc_empty",
+	"rx_pp_alloc_refill",
+	"rx_pp_alloc_waive",
+	"rx_pp_recycle_cached",
+	"rx_pp_recycle_cache_full",
+	"rx_pp_recycle_ring",
+	"rx_pp_recycle_ring_full",
+	"rx_pp_recycle_released_ref",
+};
+
 bool page_pool_get_stats(struct page_pool *pool,
 			 struct page_pool_stats *stats)
 {
@@ -50,7 +65,13 @@ bool page_pool_get_stats(struct page_poo
 	if (!stats)
 		return false;

-	memcpy(&stats->alloc_stats, &pool->alloc_stats, sizeof(pool->alloc_stats));
+	/* The caller is responsible to initialize stats. */
+	stats->alloc_stats.fast += pool->alloc_stats.fast;
+	stats->alloc_stats.slow += pool->alloc_stats.slow;
+	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
+	stats->alloc_stats.empty += pool->alloc_stats.empty;
+	stats->alloc_stats.refill += pool->alloc_stats.refill;
+	stats->alloc_stats.waive += pool->alloc_stats.waive;

 	for_each_possible_cpu(cpu) {
 		const struct page_pool_recycle_stats *pcpu =
@@ -66,6 +87,46 @@ bool page_pool_get_stats(struct page_poo
 	return true;
 }
 EXPORT_SYMBOL(page_pool_get_stats);
+
+u8 *page_pool_ethtool_stats_get_strings(u8 *data)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
+		memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
+		data += ETH_GSTRING_LEN;
+	}
+
+	return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
+
+int page_pool_ethtool_stats_get_count(void)
+{
+	return ARRAY_SIZE(pp_stats);
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
+
+u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+{
+	struct page_pool_stats *pool_stats = stats;
+
+	*data++ = pool_stats->alloc_stats.fast;
+	*data++ = pool_stats->alloc_stats.slow;
+	*data++ = pool_stats->alloc_stats.slow_high_order;
+	*data++ = pool_stats->alloc_stats.empty;
+	*data++ = pool_stats->alloc_stats.refill;
+	*data++ = pool_stats->alloc_stats.waive;
+	*data++ = pool_stats->recycle_stats.cached;
+	*data++ = pool_stats->recycle_stats.cache_full;
+	*data++ = pool_stats->recycle_stats.ring;
+	*data++ = pool_stats->recycle_stats.ring_full;
+	*data++ = pool_stats->recycle_stats.released_refcnt;
+
+	return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get);
+
 #else
 #define alloc_stat_inc(pool, __stat)
 #define recycle_stat_inc(pool, __stat)
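
For drivers, the three helpers slot straight into the ethtool callbacks. Below is a hedged sketch assuming one page pool per RX queue; struct my_priv and its layout are hypothetical, and only the page_pool_ethtool_*() and page_pool_get_stats() calls come from the patches above.

#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <net/page_pool.h>

struct my_priv {
	int num_rx_queues;
	struct page_pool *pools[8];	/* one pool per RX queue (assumed) */
};

static int my_get_sset_count(struct net_device *dev, int sset)
{
	if (sset != ETH_SS_STATS)
		return -EOPNOTSUPP;
	return page_pool_ethtool_stats_get_count();
}

static void my_get_strings(struct net_device *dev, u32 sset, u8 *data)
{
	if (sset == ETH_SS_STATS)
		data = page_pool_ethtool_stats_get_strings(data);
}

static void my_get_ethtool_stats(struct net_device *dev,
				 struct ethtool_stats *e, u64 *data)
{
	struct my_priv *priv = netdev_priv(dev);
	struct page_pool_stats stats = {};
	int i;

	/* page_pool_get_stats() now accumulates instead of overwriting,
	 * so several per-queue pools fold into one aggregate. */
	for (i = 0; i < priv->num_rx_queues; i++)
		page_pool_get_stats(priv->pools[i], &stats);

	data = page_pool_ethtool_stats_get(data, &stats);
}
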
@@ -0,0 +1,99 @@
From 2e88d4ff03013937028f5397268b21e10cf68713 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:45 +0100
Subject: [PATCH] xdp: introduce flags field in xdp_buff/xdp_frame

Introduce flags field in xdp_frame and xdp_buffer data structures
to define additional buffer features. At the moment the only
supported buffer feature is frags bit (XDP_FLAGS_HAS_FRAGS).
frags bit is used to specify if this is a linear buffer
(XDP_FLAGS_HAS_FRAGS not set) or a frags frame (XDP_FLAGS_HAS_FRAGS
set). In the latter case the driver is expected to initialize the
skb_shared_info structure at the end of the first buffer to link together
subsequent buffers belonging to the same frame.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/e389f14f3a162c0a5bc6a2e1aa8dd01a90be117d.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -66,6 +66,10 @@ struct xdp_txq_info {
 	struct net_device *dev;
 };

+enum xdp_buff_flags {
+	XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */
+};
+
 struct xdp_buff {
 	void *data;
 	void *data_end;
@@ -74,13 +78,30 @@ struct xdp_buff {
 	struct xdp_rxq_info *rxq;
 	struct xdp_txq_info *txq;
 	u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
+	u32 flags; /* supported values defined in xdp_buff_flags */
 };

+static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp)
+{
+	return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS);
+}
+
+static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp)
+{
+	xdp->flags |= XDP_FLAGS_HAS_FRAGS;
+}
+
+static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp)
+{
+	xdp->flags &= ~XDP_FLAGS_HAS_FRAGS;
+}
+
 static __always_inline void
 xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
 {
 	xdp->frame_sz = frame_sz;
 	xdp->rxq = rxq;
+	xdp->flags = 0;
 }

 static __always_inline void
@@ -122,8 +143,14 @@ struct xdp_frame {
	 */
 	struct xdp_mem_info mem;
 	struct net_device *dev_rx; /* used by cpumap */
+	u32 flags; /* supported values defined in xdp_buff_flags */
 };

+static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame)
+{
+	return !!(frame->flags & XDP_FLAGS_HAS_FRAGS);
+}
+
 #define XDP_BULK_QUEUE_SIZE 16
 struct xdp_frame_bulk {
 	int count;
@@ -180,6 +207,7 @@ void xdp_convert_frame_to_buff(struct xd
 	xdp->data_end = frame->data + frame->len;
 	xdp->data_meta = frame->data - frame->metasize;
 	xdp->frame_sz = frame->frame_sz;
+	xdp->flags = frame->flags;
 }

 static inline
@@ -206,6 +234,7 @@ int xdp_update_frame_from_buff(struct xd
 	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
 	xdp_frame->metasize = metasize;
 	xdp_frame->frame_sz = xdp->frame_sz;
+	xdp_frame->flags = xdp->flags;

 	return 0;
 }
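
A sketch of how an RX path might use the new helpers: the driver marks the xdp_buff as fragmented once it chains extra buffers behind the head. The my_rx_one_frame() wrapper and the has_more_buffers condition are illustrative assumptions, not part of the patch.

#include <net/xdp.h>

static void my_rx_one_frame(struct xdp_buff *xdp, struct xdp_rxq_info *rxq,
			    u32 frame_sz, bool has_more_buffers)
{
	xdp_init_buff(xdp, frame_sz, rxq);	/* also zeroes xdp->flags */

	if (has_more_buffers)
		xdp_buff_set_frags_flag(xdp);	/* non-linear frame */

	/* Consumers can now branch on linearity: */
	if (xdp_buff_has_frags(xdp)) {
		/* The skb_shared_info at the end of the head buffer
		 * links the remaining fragments. */
	}
}
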
@@ -0,0 +1,137 @@
From 7c48cb0176c6d6d3b55029f7ff4ffa05faee6446 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:50 +0100
Subject: [PATCH] xdp: add frags support to xdp_return_{buff/frame}

Take into account if the received xdp_buff/xdp_frame is non-linear
recycling/returning the frame memory to the allocator or into
xdp_frame_bulk.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/a961069febc868508ce1bdf5e53a343eb4e57cb2.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/net/xdp.h | 18 ++++++++++++++--
 net/core/xdp.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 69 insertions(+), 3 deletions(-)

--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -275,10 +275,24 @@ void __xdp_release_frame(void *data, str
 static inline void xdp_release_frame(struct xdp_frame *xdpf)
 {
 	struct xdp_mem_info *mem = &xdpf->mem;
+	struct skb_shared_info *sinfo;
+	int i;

 	/* Curr only page_pool needs this */
-	if (mem->type == MEM_TYPE_PAGE_POOL)
-		__xdp_release_frame(xdpf->data, mem);
+	if (mem->type != MEM_TYPE_PAGE_POOL)
+		return;
+
+	if (likely(!xdp_frame_has_frags(xdpf)))
+		goto out;
+
+	sinfo = xdp_get_shared_info_from_frame(xdpf);
+	for (i = 0; i < sinfo->nr_frags; i++) {
+		struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+		__xdp_release_frame(page_address(page), mem);
+	}
+out:
+	__xdp_release_frame(xdpf->data, mem);
 }

 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -376,12 +376,38 @@ static void __xdp_return(void *data, str

 void xdp_return_frame(struct xdp_frame *xdpf)
 {
+	struct skb_shared_info *sinfo;
+	int i;
+
+	if (likely(!xdp_frame_has_frags(xdpf)))
+		goto out;
+
+	sinfo = xdp_get_shared_info_from_frame(xdpf);
+	for (i = 0; i < sinfo->nr_frags; i++) {
+		struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+		__xdp_return(page_address(page), &xdpf->mem, false, NULL);
+	}
+out:
 	__xdp_return(xdpf->data, &xdpf->mem, false, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame);

 void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
 {
+	struct skb_shared_info *sinfo;
+	int i;
+
+	if (likely(!xdp_frame_has_frags(xdpf)))
+		goto out;
+
+	sinfo = xdp_get_shared_info_from_frame(xdpf);
+	for (i = 0; i < sinfo->nr_frags; i++) {
+		struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+		__xdp_return(page_address(page), &xdpf->mem, true, NULL);
+	}
+out:
 	__xdp_return(xdpf->data, &xdpf->mem, true, NULL);
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
@@ -417,7 +443,7 @@ void xdp_return_frame_bulk(struct xdp_fr
 	struct xdp_mem_allocator *xa;

 	if (mem->type != MEM_TYPE_PAGE_POOL) {
-		__xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+		xdp_return_frame(xdpf);
 		return;
 	}

@@ -436,12 +462,38 @@ void xdp_return_frame_bulk(struct xdp_fr
 		bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
 	}

+	if (unlikely(xdp_frame_has_frags(xdpf))) {
+		struct skb_shared_info *sinfo;
+		int i;
+
+		sinfo = xdp_get_shared_info_from_frame(xdpf);
+		for (i = 0; i < sinfo->nr_frags; i++) {
+			skb_frag_t *frag = &sinfo->frags[i];
+
+			bq->q[bq->count++] = skb_frag_address(frag);
+			if (bq->count == XDP_BULK_QUEUE_SIZE)
+				xdp_flush_frame_bulk(bq);
+		}
+	}
 	bq->q[bq->count++] = xdpf->data;
 }
 EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);

 void xdp_return_buff(struct xdp_buff *xdp)
 {
+	struct skb_shared_info *sinfo;
+	int i;
+
+	if (likely(!xdp_buff_has_frags(xdp)))
+		goto out;
+
+	sinfo = xdp_get_shared_info_from_buff(xdp);
+	for (i = 0; i < sinfo->nr_frags; i++) {
+		struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+		__xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
+	}
+out:
 	__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
 }

@@ -0,0 +1,31 @@
From d16697cb6261d4cc23422e6b1cb2759df8aa76d0 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:44 +0100
Subject: [PATCH] net: skbuff: add size metadata to skb_shared_info for xdp

Introduce xdp_frags_size field in skb_shared_info data structure
to store xdp_buff/xdp_frame frame paged size (xdp_frags_size will
be used in xdp frags support). In order to not increase
skb_shared_info size we will use a hole due to skb_shared_info
alignment.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/8a849819a3e0a143d540f78a3a5add76e17e980d.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/skbuff.h | 1 +
 1 file changed, 1 insertion(+)

--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -568,6 +568,7 @@ struct skb_shared_info {
	 * Warning : all fields before dataref are cleared in __alloc_skb()
	 */
 	atomic_t	dataref;
+	unsigned int	xdp_frags_size;

 	/* Intermediate layers must ensure that destructor_arg
	 * remains valid until skb destructor */
@@ -0,0 +1,65 @@
From 5142239a22219921a7863cf00c9ab853c00689d8 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 11 Mar 2022 10:14:18 +0100
Subject: [PATCH] net: veth: Account total xdp_frame len running ndo_xdp_xmit

Even if this is a theoretical issue since it is not possible to perform
XDP_REDIRECT on a non-linear xdp_frame, veth driver does not account
paged area in ndo_xdp_xmit function pointer.
Introduce xdp_get_frame_len utility routine to get the xdp_frame full
length and account total frame size running XDP_REDIRECT of a
non-linear xdp frame into a veth device.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/54f9fd3bb65d190daf2c0bbae2f852ff16cfbaa0.1646989407.git.lorenzo@kernel.org
---
 drivers/net/veth.c | 4 ++--
 include/net/xdp.h | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -501,7 +501,7 @@ static int veth_xdp_xmit(struct net_devi
 		struct xdp_frame *frame = frames[i];
 		void *ptr = veth_xdp_to_ptr(frame);

-		if (unlikely(frame->len > max_len ||
+		if (unlikely(xdp_get_frame_len(frame) > max_len ||
 			     __ptr_ring_produce(&rq->xdp_ring, ptr)))
 			break;
 		nxmit++;
@@ -862,7 +862,7 @@ static int veth_xdp_rcv(struct veth_rq *
 			/* ndo_xdp_xmit */
 			struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

-			stats->xdp_bytes += frame->len;
+			stats->xdp_bytes += xdp_get_frame_len(frame);
 			frame = veth_xdp_rcv_one(rq, frame, bq, stats);
 			if (frame) {
 				/* XDP_PASS */
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -295,6 +295,20 @@ out:
 	__xdp_release_frame(xdpf->data, mem);
 }

+static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf)
+{
+	struct skb_shared_info *sinfo;
+	unsigned int len = xdpf->len;
+
+	if (likely(!xdp_frame_has_frags(xdpf)))
+		goto out;
+
+	sinfo = xdp_get_shared_info_from_frame(xdpf);
+	len += sinfo->xdp_frags_size;
+out:
+	return len;
+}
+
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index, unsigned int napi_id);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
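
xdp_get_frame_len() is the drop-in replacement wherever a driver previously read frame->len directly. A minimal sketch of batch byte accounting; my_count_xdp_bytes() is an illustrative name, not part of the patch.

#include <net/xdp.h>

static void my_count_xdp_bytes(struct xdp_frame **frames, int n, u64 *bytes)
{
	int i;

	/* Counts head plus paged area, unlike the bare frames[i]->len */
	for (i = 0; i < n; i++)
		*bytes += xdp_get_frame_len(frames[i]);
}
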
@@ -0,0 +1,40 @@
From 7cda76d858a4e71ac4a04066c093679a12e1312c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 11 Mar 2022 10:14:20 +0100
Subject: [PATCH] veth: Allow jumbo frames in xdp mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow increasing the MTU over page boundaries on veth devices
if the attached xdp program declares to support xdp fragments.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/d5dc039c3d4123426e7023a488c449181a7bc57f.1646989407.git.lorenzo@kernel.org
---
 drivers/net/veth.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1471,9 +1471,14 @@ static int veth_xdp_set(struct net_devic
 			goto err;
 		}

-		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
-			  peer->hard_header_len -
-			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+		max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
+			  peer->hard_header_len;
+		/* Allow increasing the max_mtu if the program supports
+		 * XDP fragments.
+		 */
+		//if (prog->aux->xdp_has_frags)
+		max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;
+
 		if (peer->mtu > max_mtu) {
 			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
 			err = -ERANGE;
@@ -0,0 +1,56 @@
From: Qingfang DENG <qingfang.deng@siflower.com.cn>
Date: Fri, 3 Feb 2023 09:16:11 +0800
Subject: [PATCH] net: page_pool: use in_softirq() instead

We use BH context only for synchronization, so we don't care if it's
actually serving softirq or not.

As a side node, in case of threaded NAPI, in_serving_softirq() will
return false because it's in process context with BH off, making
page_pool_recycle_in_cache() unreachable.

Signed-off-by: Qingfang DENG <qingfang.deng@siflower.com.cn>
---

--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -357,7 +357,7 @@ static inline void page_pool_nid_changed
 static inline void page_pool_ring_lock(struct page_pool *pool)
	__acquires(&pool->ring.producer_lock)
 {
-	if (in_serving_softirq())
+	if (in_softirq())
 		spin_lock(&pool->ring.producer_lock);
 	else
 		spin_lock_bh(&pool->ring.producer_lock);
@@ -366,7 +366,7 @@ static inline void page_pool_ring_lock(s
 static inline void page_pool_ring_unlock(struct page_pool *pool)
	__releases(&pool->ring.producer_lock)
 {
-	if (in_serving_softirq())
+	if (in_softirq())
 		spin_unlock(&pool->ring.producer_lock);
 	else
 		spin_unlock_bh(&pool->ring.producer_lock);
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -512,8 +512,8 @@ static void page_pool_return_page(struct
 static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
 {
 	int ret;
-	/* BH protection not needed if current is serving softirq */
-	if (in_serving_softirq())
+	/* BH protection not needed if current is softirq */
+	if (in_softirq())
 		ret = ptr_ring_produce(&pool->ring, page);
 	else
 		ret = ptr_ring_produce_bh(&pool->ring, page);
@@ -576,7 +576,7 @@ __page_pool_put_page(struct page_pool *p
 			page_pool_dma_sync_for_device(pool, page,
 						      dma_sync_size);

-	if (allow_direct && in_serving_softirq() &&
+	if (allow_direct && in_softirq() &&
	    page_pool_recycle_in_cache(page, pool))
 		return NULL;

@@ -0,0 +1,41 @@
From 7390609b0121a1b982c5ecdfcd72dc328e5784ee Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Mon, 6 Feb 2023 13:43:42 +0000
Subject: [PATCH] net: add helper eth_addr_add()

Add a helper to add an offset to a ethernet address. This comes in handy
if you have a base ethernet address for multiple interfaces.

Signed-off-by: Michael Walle <michael@walle.cc>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230206134356.839737-9-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/etherdevice.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -478,6 +478,20 @@ static inline void eth_addr_inc(u8 *addr
 }

 /**
+ * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address.
+ *
+ * @offset: Offset to add.
+ * @addr: Pointer to a six-byte array containing Ethernet address to increment.
+ */
+static inline void eth_addr_add(u8 *addr, long offset)
+{
+	u64 u = ether_addr_to_u64(addr);
+
+	u += offset;
+	u64_to_ether_addr(u, addr);
+}
+
+/**
 * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
 * @dev: Pointer to a device structure
 * @addr: Pointer to a six-byte array containing the Ethernet address
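
An illustrative use of eth_addr_add(): deriving per-port MAC addresses from a single base address, e.g. when an NVMEM cell stores only the first one. The my_assign_port_macs() helper and the pr_info() reporting are assumptions for the sketch.

#include <linux/etherdevice.h>
#include <linux/printk.h>

static void my_assign_port_macs(const u8 base[ETH_ALEN], int nports)
{
	int i;

	for (i = 0; i < nports; i++) {
		u8 addr[ETH_ALEN];

		ether_addr_copy(addr, base);
		eth_addr_add(addr, i);		/* base + port index */
		pr_info("port %d: %pM\n", i, addr);
	}
}
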
@@ -0,0 +1,279 @@
From dc452a471dbae8aca8257c565174212620880093 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 10 Dec 2021 01:34:37 +0200
Subject: net: dsa: introduce tagger-owned storage for private and shared data

Ansuel is working on register access over Ethernet for the qca8k switch
family. This requires the qca8k tagging protocol driver to receive
frames which aren't intended for the network stack, but instead for the
qca8k switch driver itself.

The dp->priv is currently the prevailing method for passing data back
and forth between the tagging protocol driver and the switch driver.
However, this method is riddled with caveats.

The DSA design allows in principle for any switch driver to return any
protocol it desires in ->get_tag_protocol(). The dsa_loop driver can be
modified to do just that. But in the current design, the memory behind
dp->priv has to be allocated by the switch driver, so if the tagging
protocol is paired to an unexpected switch driver, we may end up in NULL
pointer dereferences inside the kernel, or worse (a switch driver may
allocate dp->priv according to the expectations of a different tagger).

The latter possibility is even more plausible considering that DSA
switches can dynamically change tagging protocols in certain cases
(dsa <-> edsa, ocelot <-> ocelot-8021q), and the current design lends
itself to mistakes that are all too easy to make.

This patch proposes that the tagging protocol driver should manage its
own memory, instead of relying on the switch driver to do so.
After analyzing the different in-tree needs, it can be observed that the
required tagger storage is per switch, therefore a ds->tagger_data
pointer is introduced. In principle, per-port storage could also be
introduced, although there is no need for it at the moment. Future
changes will replace the current usage of dp->priv with ds->tagger_data.

We define a "binding" event between the DSA switch tree and the tagging
protocol. During this binding event, the tagging protocol's ->connect()
method is called first, and this may allocate some memory for each
switch of the tree. Then a cross-chip notifier is emitted for the
switches within that tree, and they are given the opportunity to fix up
the tagger's memory (for example, they might set up some function
pointers that represent virtual methods for consuming packets).
Because the memory is owned by the tagger, there exists a ->disconnect()
method for the tagger (which is the place to free the resources), but
there doesn't exist a ->disconnect() method for the switch driver.
This is part of the design. The switch driver should make minimal use of
the public part of the tagger data, and only after type-checking it
using the supplied "proto" argument.

In the code there are in fact two binding events, one is the initial
event in dsa_switch_setup_tag_protocol(). At this stage, the cross chip
notifier chains aren't initialized, so we call each switch's connect()
method by hand. Then there is dsa_tree_bind_tag_proto() during
dsa_tree_change_tag_proto(), and here we have an old protocol and a new
one. We first connect to the new one before disconnecting from the old
one, to simplify error handling a bit and to ensure we remain in a valid
state at all times.

Co-developed-by: Ansuel Smith <ansuelsmth@gmail.com>
Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 12 +++++++++
 net/dsa/dsa2.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 net/dsa/dsa_priv.h | 1 +
 net/dsa/switch.c | 14 +++++++++++
 4 files changed, 96 insertions(+), 4 deletions(-)

--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -80,12 +80,15 @@ enum dsa_tag_protocol {
 };

 struct dsa_switch;
+struct dsa_switch_tree;

 struct dsa_device_ops {
 	struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
 	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
 	void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
 			     int *offset);
+	int (*connect)(struct dsa_switch_tree *dst);
+	void (*disconnect)(struct dsa_switch_tree *dst);
 	unsigned int needed_headroom;
 	unsigned int needed_tailroom;
 	const char *name;
@@ -329,6 +332,8 @@ struct dsa_switch {
	 */
 	void *priv;

+	void *tagger_data;
+
 	/*
	 * Configuration data for this switch.
	 */
@@ -584,6 +589,13 @@ struct dsa_switch_ops {
 						  enum dsa_tag_protocol mprot);
 	int	(*change_tag_protocol)(struct dsa_switch *ds, int port,
 				       enum dsa_tag_protocol proto);
+	/*
+	 * Method for switch drivers to connect to the tagging protocol driver
+	 * in current use. The switch driver can provide handlers for certain
+	 * types of packets for switch management.
+	 */
+	int	(*connect_tag_protocol)(struct dsa_switch *ds,
+					enum dsa_tag_protocol proto);

 	/* Optional switch-wide initialization and destruction methods */
 	int	(*setup)(struct dsa_switch *ds);
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -230,8 +230,12 @@ static struct dsa_switch_tree *dsa_tree_

 static void dsa_tree_free(struct dsa_switch_tree *dst)
 {
-	if (dst->tag_ops)
+	if (dst->tag_ops) {
+		if (dst->tag_ops->disconnect)
+			dst->tag_ops->disconnect(dst);
+
 		dsa_tag_driver_put(dst->tag_ops);
+	}
 	list_del(&dst->list);
 	kfree(dst);
 }
@@ -805,7 +809,7 @@ static int dsa_switch_setup_tag_protocol
 	int port, err;

 	if (tag_ops->proto == dst->default_proto)
-		return 0;
+		goto connect;

 	for (port = 0; port < ds->num_ports; port++) {
 		if (!dsa_is_cpu_port(ds, port))
@@ -821,6 +825,17 @@ static int dsa_switch_setup_tag_protocol
 		}
 	}

+connect:
+	if (ds->ops->connect_tag_protocol) {
+		err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+		if (err) {
+			dev_err(ds->dev,
+				"Unable to connect to tag protocol \"%s\": %pe\n",
+				tag_ops->name, ERR_PTR(err));
+			return err;
+		}
+	}
+
 	return 0;
 }

@@ -1132,6 +1147,46 @@ static void dsa_tree_teardown(struct dsa
 	dst->setup = false;
 }

+static int dsa_tree_bind_tag_proto(struct dsa_switch_tree *dst,
+				   const struct dsa_device_ops *tag_ops)
+{
+	const struct dsa_device_ops *old_tag_ops = dst->tag_ops;
+	struct dsa_notifier_tag_proto_info info;
+	int err;
+
+	dst->tag_ops = tag_ops;
+
+	/* Notify the new tagger about the connection to this tree */
+	if (tag_ops->connect) {
+		err = tag_ops->connect(dst);
+		if (err)
+			goto out_revert;
+	}
+
+	/* Notify the switches from this tree about the connection
+	 * to the new tagger
+	 */
+	info.tag_ops = tag_ops;
+	err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_CONNECT, &info);
+	if (err && err != -EOPNOTSUPP)
+		goto out_disconnect;
+
+	/* Notify the old tagger about the disconnection from this tree */
+	if (old_tag_ops->disconnect)
+		old_tag_ops->disconnect(dst);
+
+	return 0;
+
+out_disconnect:
+	/* Revert the new tagger's connection to this tree */
+	if (tag_ops->disconnect)
+		tag_ops->disconnect(dst);
+out_revert:
+	dst->tag_ops = old_tag_ops;
+
+	return err;
+}
+
 /* Since the dsa/tagging sysfs device attribute is per master, the assumption
 * is that all DSA switches within a tree share the same tagger, otherwise
 * they would have formed disjoint trees (different "dsa,member" values).
@@ -1164,12 +1219,15 @@ int dsa_tree_change_tag_proto(struct dsa
 		goto out_unlock;
 	}

+	/* Notify the tag protocol change */
 	info.tag_ops = tag_ops;
 	err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
 	if (err)
-		goto out_unwind_tagger;
+		return err;

-	dst->tag_ops = tag_ops;
+	err = dsa_tree_bind_tag_proto(dst, tag_ops);
+	if (err)
+		goto out_unwind_tagger;

 	rtnl_unlock();

@@ -1257,6 +1315,7 @@ static int dsa_port_parse_cpu(struct dsa
 	struct dsa_switch *ds = dp->ds;
 	struct dsa_switch_tree *dst = ds->dst;
 	enum dsa_tag_protocol default_proto;
+	int err;

 	/* Find out which protocol the switch would prefer. */
 	default_proto = dsa_get_tag_protocol(dp, master);
@@ -1311,6 +1370,12 @@ static int dsa_port_parse_cpu(struct dsa
		 */
 		dsa_tag_driver_put(tag_ops);
 	} else {
+		if (tag_ops->connect) {
+			err = tag_ops->connect(dst);
+			if (err)
+				return err;
+		}
+
 		dst->tag_ops = tag_ops;
 	}

--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -37,6 +37,7 @@ enum {
 	DSA_NOTIFIER_VLAN_DEL,
 	DSA_NOTIFIER_MTU,
 	DSA_NOTIFIER_TAG_PROTO,
+	DSA_NOTIFIER_TAG_PROTO_CONNECT,
 	DSA_NOTIFIER_MRP_ADD,
 	DSA_NOTIFIER_MRP_DEL,
 	DSA_NOTIFIER_MRP_ADD_RING_ROLE,
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -616,6 +616,17 @@ static int dsa_switch_change_tag_proto(s
 	return 0;
 }

+static int dsa_switch_connect_tag_proto(struct dsa_switch *ds,
+					struct dsa_notifier_tag_proto_info *info)
+{
+	const struct dsa_device_ops *tag_ops = info->tag_ops;
+
+	if (!ds->ops->connect_tag_protocol)
+		return -EOPNOTSUPP;
+
+	return ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+}
+
 static int dsa_switch_mrp_add(struct dsa_switch *ds,
 			      struct dsa_notifier_mrp_info *info)
 {
@@ -735,6 +746,9 @@ static int dsa_switch_event(struct notif
 	case DSA_NOTIFIER_TAG_PROTO:
 		err = dsa_switch_change_tag_proto(ds, info);
 		break;
+	case DSA_NOTIFIER_TAG_PROTO_CONNECT:
+		err = dsa_switch_connect_tag_proto(ds, info);
+		break;
 	case DSA_NOTIFIER_MRP_ADD:
 		err = dsa_switch_mrp_add(ds, info);
 		break;
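
On the tagger side, the contract looks roughly like the sketch below, written against this patch's tree-wide ->connect()/->disconnect() signatures; struct my_tagger_data and its handler field are hypothetical. Note that the follow-up patch in this series reworks these hooks to take a single switch instead of the whole tree.

#include <linux/slab.h>
#include <net/dsa.h>

struct my_tagger_data {
	/* e.g. a virtual method the switch driver fills in later */
	void (*meta_frame_handler)(struct sk_buff *skb);
};

static int my_tagger_connect(struct dsa_switch_tree *dst)
{
	struct dsa_port *dp;

	list_for_each_entry(dp, &dst->ports, list) {
		struct my_tagger_data *td;

		if (dp->ds->tagger_data)
			continue;	/* one allocation per switch */

		td = kzalloc(sizeof(*td), GFP_KERNEL);
		if (!td)
			return -ENOMEM;	/* DSA unwinds via ->disconnect() */

		dp->ds->tagger_data = td;
	}

	return 0;
}

static void my_tagger_disconnect(struct dsa_switch_tree *dst)
{
	struct dsa_port *dp;

	list_for_each_entry(dp, &dst->ports, list) {
		kfree(dp->ds->tagger_data);
		dp->ds->tagger_data = NULL;	/* guards repeat visits */
	}
}
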
@@ -0,0 +1,274 @@
From 7f2973149c22e7a6fee4c0c9fa6b8e4108e9c208 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Tue, 14 Dec 2021 03:45:36 +0200
Subject: net: dsa: make tagging protocols connect to individual switches from
 a tree

On the NXP Bluebox 3 board which uses a multi-switch setup with sja1105,
the mechanism through which the tagger connects to the switch tree is
broken, due to improper DSA code design. At the time when tag_ops->connect()
is called in dsa_port_parse_cpu(), DSA hasn't finished "touching" all
the ports, so it doesn't know how large the tree is and how many ports
it has. It has just seen the first CPU port by this time. As a result,
this function will call the tagger's ->connect method too early, and the
tagger will connect only to the first switch from the tree.

This could be perhaps addressed a bit more simply by just moving the
tag_ops->connect(dst) call a bit later (for example in dsa_tree_setup),
but there is already a design inconsistency at present: on the switch
side, the notification is on a per-switch basis, but on the tagger side,
it is on a per-tree basis. Furthermore, the persistent storage itself is
per switch (ds->tagger_data). And the tagger connect and disconnect
procedures (at least the ones that exist currently) could see a fair bit
of simplification if they didn't have to iterate through the switches of
a tree.

To fix the issue, this change transforms tag_ops->connect(dst) into
tag_ops->connect(ds) and moves it somewhere where we already iterate
over all switches of a tree. That is in dsa_switch_setup_tag_protocol(),
which is a good placement because we already have there the connection
call to the switch side of things.

As for the dsa_tree_bind_tag_proto() method (called from the code path
that changes the tag protocol), things are a bit more complicated
because we receive the tree as argument, yet when we unwind on errors,
it would be nice to not call tag_ops->disconnect(ds) where we didn't
previously call tag_ops->connect(ds). We didn't have this problem before
because the tag_ops connection operations passed the entire dst before,
and this is more fine grained now. To solve the error rewind case using
the new API, we have to create yet one more cross-chip notifier for
disconnection, and stay connected with the old tag protocol to all the
switches in the tree until we've succeeded to connect with the new one
as well. So if something fails half way, the whole tree is still
connected to the old tagger. But there may still be leaks if the tagger
fails to connect to the 2nd out of 3 switches in a tree: somebody needs
to tell the tagger to disconnect from the first switch. Nothing comes
for free, and this was previously handled privately by the tagging
protocol driver before, but now we need to emit a disconnect cross-chip
notifier for that, because DSA has to take care of the unwind path. We
assume that the tagging protocol has connected to a switch if it has set
ds->tagger_data to something, otherwise we avoid calling its
disconnection method in the error rewind path.

The rest of the changes are in the tagging protocol drivers, and have to
do with the replacement of dst with ds. The iteration is removed and the
error unwind path is simplified, as mentioned above.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dsa.h | 5 ++--
 net/dsa/dsa2.c | 44 +++++++++++++-----------------
 net/dsa/dsa_priv.h | 1 +
 net/dsa/switch.c | 52 ++++++++++++++++++++++++++++++++---
 net/dsa/tag_ocelot_8021q.c | 53 +++++++++++-------------------------
 net/dsa/tag_sja1105.c | 67 ++++++++++++++++------------------------------
 6 files changed, 109 insertions(+), 113 deletions(-)

--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -80,15 +80,14 @@ enum dsa_tag_protocol {
 };

 struct dsa_switch;
-struct dsa_switch_tree;

 struct dsa_device_ops {
 	struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
 	struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
 	void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
 			     int *offset);
-	int (*connect)(struct dsa_switch_tree *dst);
-	void (*disconnect)(struct dsa_switch_tree *dst);
+	int (*connect)(struct dsa_switch *ds);
+	void (*disconnect)(struct dsa_switch *ds);
 	unsigned int needed_headroom;
 	unsigned int needed_tailroom;
 	const char *name;
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -230,12 +230,8 @@ static struct dsa_switch_tree *dsa_tree_

 static void dsa_tree_free(struct dsa_switch_tree *dst)
 {
-	if (dst->tag_ops) {
-		if (dst->tag_ops->disconnect)
-			dst->tag_ops->disconnect(dst);
-
+	if (dst->tag_ops)
 		dsa_tag_driver_put(dst->tag_ops);
-	}
 	list_del(&dst->list);
 	kfree(dst);
 }
@@ -826,17 +822,29 @@ static int dsa_switch_setup_tag_protocol
 	}

 connect:
+	if (tag_ops->connect) {
+		err = tag_ops->connect(ds);
+		if (err)
+			return err;
+	}
+
 	if (ds->ops->connect_tag_protocol) {
 		err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
 		if (err) {
 			dev_err(ds->dev,
 				"Unable to connect to tag protocol \"%s\": %pe\n",
 				tag_ops->name, ERR_PTR(err));
-			return err;
+			goto disconnect;
 		}
 	}

 	return 0;
+
+disconnect:
+	if (tag_ops->disconnect)
+		tag_ops->disconnect(ds);
+
+	return err;
 }

 static int dsa_switch_setup(struct dsa_switch *ds)
@@ -1156,13 +1164,6 @@ static int dsa_tree_bind_tag_proto(struc

 	dst->tag_ops = tag_ops;

-	/* Notify the new tagger about the connection to this tree */
-	if (tag_ops->connect) {
-		err = tag_ops->connect(dst);
-		if (err)
-			goto out_revert;
-	}
-
 	/* Notify the switches from this tree about the connection
	 * to the new tagger
	 */
@@ -1172,16 +1173,14 @@ static int dsa_tree_bind_tag_proto(struc
 		goto out_disconnect;

 	/* Notify the old tagger about the disconnection from this tree */
-	if (old_tag_ops->disconnect)
-		old_tag_ops->disconnect(dst);
+	info.tag_ops = old_tag_ops;
+	dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);

 	return 0;

 out_disconnect:
-	/* Revert the new tagger's connection to this tree */
-	if (tag_ops->disconnect)
-		tag_ops->disconnect(dst);
-out_revert:
+	info.tag_ops = tag_ops;
+	dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
 	dst->tag_ops = old_tag_ops;

 	return err;
@@ -1315,7 +1314,6 @@ static int dsa_port_parse_cpu(struct dsa
 	struct dsa_switch *ds = dp->ds;
 	struct dsa_switch_tree *dst = ds->dst;
 	enum dsa_tag_protocol default_proto;
-	int err;

 	/* Find out which protocol the switch would prefer. */
 	default_proto = dsa_get_tag_protocol(dp, master);
@@ -1370,12 +1368,6 @@ static int dsa_port_parse_cpu(struct dsa
		 */
 		dsa_tag_driver_put(tag_ops);
 	} else {
-		if (tag_ops->connect) {
-			err = tag_ops->connect(dst);
-			if (err)
-				return err;
-		}
-
 		dst->tag_ops = tag_ops;
 	}

--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -38,6 +38,7 @@ enum {
 	DSA_NOTIFIER_MTU,
 	DSA_NOTIFIER_TAG_PROTO,
 	DSA_NOTIFIER_TAG_PROTO_CONNECT,
+	DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
 	DSA_NOTIFIER_MRP_ADD,
 	DSA_NOTIFIER_MRP_DEL,
 	DSA_NOTIFIER_MRP_ADD_RING_ROLE,
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -616,15 +616,58 @@ static int dsa_switch_change_tag_proto(s
 	return 0;
 }

-static int dsa_switch_connect_tag_proto(struct dsa_switch *ds,
-					struct dsa_notifier_tag_proto_info *info)
+/* We use the same cross-chip notifiers to inform both the tagger side, as well
+ * as the switch side, of connection and disconnection events.
+ * Since ds->tagger_data is owned by the tagger, it isn't a hard error if the
+ * switch side doesn't support connecting to this tagger, and therefore, the
+ * fact that we don't disconnect the tagger side doesn't constitute a memory
+ * leak: the tagger will still operate with persistent per-switch memory, just
+ * with the switch side unconnected to it. What does constitute a hard error is
+ * when the switch side supports connecting but fails.
+ */
+static int
+dsa_switch_connect_tag_proto(struct dsa_switch *ds,
+			     struct dsa_notifier_tag_proto_info *info)
 {
 	const struct dsa_device_ops *tag_ops = info->tag_ops;
+	int err;
+
+	/* Notify the new tagger about the connection to this switch */
+	if (tag_ops->connect) {
+		err = tag_ops->connect(ds);
+		if (err)
+			return err;
+	}

 	if (!ds->ops->connect_tag_protocol)
 		return -EOPNOTSUPP;

-	return ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+	/* Notify the switch about the connection to the new tagger */
+	err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+	if (err) {
+		/* Revert the new tagger's connection to this tree */
+		if (tag_ops->disconnect)
+			tag_ops->disconnect(ds);
+		return err;
+	}
+
+	return 0;
+}
+
+static int
+dsa_switch_disconnect_tag_proto(struct dsa_switch *ds,
+				struct dsa_notifier_tag_proto_info *info)
+{
+	const struct dsa_device_ops *tag_ops = info->tag_ops;
+
+	/* Notify the tagger about the disconnection from this switch */
+	if (tag_ops->disconnect && ds->tagger_data)
+		tag_ops->disconnect(ds);
+
+	/* No need to notify the switch, since it shouldn't have any
+	 * resources to tear down
+	 */
+	return 0;
 }

 static int dsa_switch_mrp_add(struct dsa_switch *ds,
@@ -749,6 +792,9 @@ static int dsa_switch_event(struct notif
 	case DSA_NOTIFIER_TAG_PROTO_CONNECT:
 		err = dsa_switch_connect_tag_proto(ds, info);
 		break;
+	case DSA_NOTIFIER_TAG_PROTO_DISCONNECT:
+		err = dsa_switch_disconnect_tag_proto(ds, info);
|
||||
+ break;
|
||||
case DSA_NOTIFIER_MRP_ADD:
|
||||
err = dsa_switch_mrp_add(ds, info);
|
||||
break;
|
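
With connect() and disconnect() now taking a struct dsa_switch rather than a whole tree, a tagger that opts in allocates its per-switch state in connect() and stores it in ds->tagger_data, which the comment above says is owned by the tagger. The following sketch shows the shape of such an implementation; the example_* identifiers are invented for illustration and are not part of this patch (the real in-tree users are tag_ocelot_8021q.c and tag_sja1105.c):

/* Illustrative sketch only: a tagger using the reworked per-switch hooks.
 * Everything prefixed "example_" is hypothetical.
 */
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <net/dsa.h>

struct example_tagger_data {
    spinlock_t lock;        /* protects per-switch tagger state */
};

static int example_connect(struct dsa_switch *ds)
{
    struct example_tagger_data *priv;

    priv = kzalloc(sizeof(*priv), GFP_KERNEL);
    if (!priv)
        return -ENOMEM;

    spin_lock_init(&priv->lock);
    ds->tagger_data = priv;     /* owned by the tagger side */

    return 0;
}

static void example_disconnect(struct dsa_switch *ds)
{
    kfree(ds->tagger_data);
    ds->tagger_data = NULL;
}

static const struct dsa_device_ops example_netdev_ops = {
    .name       = "example",
    .connect    = example_connect,
    .disconnect = example_disconnect,
    /* .proto, .xmit, .rcv and the headroom fields omitted for brevity */
};

This also explains why dsa_switch_disconnect_tag_proto() checks ds->tagger_data before calling disconnect(): a switch whose tagger never connected has no per-switch state to tear down.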
@ -0,0 +1,327 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 5 Feb 2022 17:59:07 +0100
Subject: [PATCH] net: ethernet: mtk_eth_soc: add support for coherent
 DMA

It improves performance by eliminating the need for a cache flush on rx and tx.

In preparation for supporting WED (Wireless Ethernet Dispatch), also add a
function for disabling coherent DMA at runtime.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -9,6 +9,7 @@
 #include <linux/of_device.h>
 #include <linux/of_mdio.h>
 #include <linux/of_net.h>
+#include <linux/of_address.h>
 #include <linux/mfd/syscon.h>
 #include <linux/regmap.h>
 #include <linux/clk.h>
@@ -840,7 +841,7 @@ static int mtk_init_fq_dma(struct mtk_et
    dma_addr_t dma_addr;
    int i;

-   eth->scratch_ring = dma_alloc_coherent(eth->dev,
+   eth->scratch_ring = dma_alloc_coherent(eth->dma_dev,
                           cnt * sizeof(struct mtk_tx_dma),
                           &eth->phy_scratch_ring,
                           GFP_ATOMIC);
@@ -852,10 +853,10 @@ static int mtk_init_fq_dma(struct mtk_et
    if (unlikely(!eth->scratch_head))
        return -ENOMEM;

-   dma_addr = dma_map_single(eth->dev,
+   dma_addr = dma_map_single(eth->dma_dev,
                  eth->scratch_head, cnt * MTK_QDMA_PAGE_SIZE,
                  DMA_FROM_DEVICE);
-   if (unlikely(dma_mapping_error(eth->dev, dma_addr)))
+   if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr)))
        return -ENOMEM;

    phy_ring_tail = eth->phy_scratch_ring +
@@ -909,26 +910,26 @@ static void mtk_tx_unmap(struct mtk_eth
 {
    if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
        if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) {
-           dma_unmap_single(eth->dev,
+           dma_unmap_single(eth->dma_dev,
                     dma_unmap_addr(tx_buf, dma_addr0),
                     dma_unmap_len(tx_buf, dma_len0),
                     DMA_TO_DEVICE);
        } else if (tx_buf->flags & MTK_TX_FLAGS_PAGE0) {
-           dma_unmap_page(eth->dev,
+           dma_unmap_page(eth->dma_dev,
                       dma_unmap_addr(tx_buf, dma_addr0),
                       dma_unmap_len(tx_buf, dma_len0),
                       DMA_TO_DEVICE);
        }
    } else {
        if (dma_unmap_len(tx_buf, dma_len0)) {
-           dma_unmap_page(eth->dev,
+           dma_unmap_page(eth->dma_dev,
                       dma_unmap_addr(tx_buf, dma_addr0),
                       dma_unmap_len(tx_buf, dma_len0),
                       DMA_TO_DEVICE);
        }

        if (dma_unmap_len(tx_buf, dma_len1)) {
-           dma_unmap_page(eth->dev,
+           dma_unmap_page(eth->dma_dev,
                       dma_unmap_addr(tx_buf, dma_addr1),
                       dma_unmap_len(tx_buf, dma_len1),
                       DMA_TO_DEVICE);
@@ -1006,9 +1007,9 @@ static int mtk_tx_map(struct sk_buff *sk
    if (skb_vlan_tag_present(skb))
        txd4 |= TX_DMA_INS_VLAN | skb_vlan_tag_get(skb);

-   mapped_addr = dma_map_single(eth->dev, skb->data,
+   mapped_addr = dma_map_single(eth->dma_dev, skb->data,
                     skb_headlen(skb), DMA_TO_DEVICE);
-   if (unlikely(dma_mapping_error(eth->dev, mapped_addr)))
+   if (unlikely(dma_mapping_error(eth->dma_dev, mapped_addr)))
        return -ENOMEM;

    WRITE_ONCE(itxd->txd1, mapped_addr);
@@ -1047,10 +1048,10 @@ static int mtk_tx_map(struct sk_buff *sk


        frag_map_size = min(frag_size, MTK_TX_DMA_BUF_LEN);
-       mapped_addr = skb_frag_dma_map(eth->dev, frag, offset,
+       mapped_addr = skb_frag_dma_map(eth->dma_dev, frag, offset,
                           frag_map_size,
                           DMA_TO_DEVICE);
-       if (unlikely(dma_mapping_error(eth->dev, mapped_addr)))
+       if (unlikely(dma_mapping_error(eth->dma_dev, mapped_addr)))
            goto err_dma;

        if (i == nr_frags - 1 &&
@@ -1331,18 +1332,18 @@ static int mtk_poll_rx(struct napi_struc
            netdev->stats.rx_dropped++;
            goto release_desc;
        }
-       dma_addr = dma_map_single(eth->dev,
+       dma_addr = dma_map_single(eth->dma_dev,
                      new_data + NET_SKB_PAD +
                      eth->ip_align,
                      ring->buf_size,
                      DMA_FROM_DEVICE);
-       if (unlikely(dma_mapping_error(eth->dev, dma_addr))) {
+       if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr))) {
            skb_free_frag(new_data);
            netdev->stats.rx_dropped++;
            goto release_desc;
        }

-       dma_unmap_single(eth->dev, trxd.rxd1,
+       dma_unmap_single(eth->dma_dev, trxd.rxd1,
                 ring->buf_size, DMA_FROM_DEVICE);

        /* receive data */
@@ -1615,7 +1616,7 @@ static int mtk_tx_alloc(struct mtk_eth *
    if (!ring->buf)
        goto no_tx_mem;

-   ring->dma = dma_alloc_coherent(eth->dev, MTK_DMA_SIZE * sz,
+   ring->dma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz,
                       &ring->phys, GFP_ATOMIC);
    if (!ring->dma)
        goto no_tx_mem;
@@ -1633,7 +1634,7 @@ static int mtk_tx_alloc(struct mtk_eth *
     * descriptors in ring->dma_pdma.
     */
    if (!MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
-       ring->dma_pdma = dma_alloc_coherent(eth->dev, MTK_DMA_SIZE * sz,
+       ring->dma_pdma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz,
                            &ring->phys_pdma,
                            GFP_ATOMIC);
        if (!ring->dma_pdma)
@@ -1692,7 +1693,7 @@ static void mtk_tx_clean(struct mtk_eth
    }

    if (ring->dma) {
-       dma_free_coherent(eth->dev,
+       dma_free_coherent(eth->dma_dev,
                  MTK_DMA_SIZE * sizeof(*ring->dma),
                  ring->dma,
                  ring->phys);
@@ -1700,7 +1701,7 @@ static void mtk_tx_clean(struct mtk_eth
    }

    if (ring->dma_pdma) {
-       dma_free_coherent(eth->dev,
+       dma_free_coherent(eth->dma_dev,
                  MTK_DMA_SIZE * sizeof(*ring->dma_pdma),
                  ring->dma_pdma,
                  ring->phys_pdma);
@@ -1748,18 +1749,18 @@ static int mtk_rx_alloc(struct mtk_eth *
            return -ENOMEM;
    }

-   ring->dma = dma_alloc_coherent(eth->dev,
+   ring->dma = dma_alloc_coherent(eth->dma_dev,
                       rx_dma_size * sizeof(*ring->dma),
                       &ring->phys, GFP_ATOMIC);
    if (!ring->dma)
        return -ENOMEM;

    for (i = 0; i < rx_dma_size; i++) {
-       dma_addr_t dma_addr = dma_map_single(eth->dev,
+       dma_addr_t dma_addr = dma_map_single(eth->dma_dev,
                ring->data[i] + NET_SKB_PAD + eth->ip_align,
                ring->buf_size,
                DMA_FROM_DEVICE);
-       if (unlikely(dma_mapping_error(eth->dev, dma_addr)))
+       if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr)))
            return -ENOMEM;
        ring->dma[i].rxd1 = (unsigned int)dma_addr;

@@ -1795,7 +1796,7 @@ static void mtk_rx_clean(struct mtk_eth
            continue;
        if (!ring->dma[i].rxd1)
            continue;
-       dma_unmap_single(eth->dev,
+       dma_unmap_single(eth->dma_dev,
                 ring->dma[i].rxd1,
                 ring->buf_size,
                 DMA_FROM_DEVICE);
@@ -1806,7 +1807,7 @@ static void mtk_rx_clean(struct mtk_eth
    }

    if (ring->dma) {
-       dma_free_coherent(eth->dev,
+       dma_free_coherent(eth->dma_dev,
                  ring->dma_size * sizeof(*ring->dma),
                  ring->dma,
                  ring->phys);
@@ -2162,7 +2163,7 @@ static void mtk_dma_free(struct mtk_eth
        if (eth->netdev[i])
            netdev_reset_queue(eth->netdev[i]);
    if (eth->scratch_ring) {
-       dma_free_coherent(eth->dev,
+       dma_free_coherent(eth->dma_dev,
                  MTK_DMA_SIZE * sizeof(struct mtk_tx_dma),
                  eth->scratch_ring,
                  eth->phy_scratch_ring);
@@ -2514,6 +2515,8 @@ static void mtk_dim_tx(struct work_struc

 static int mtk_hw_init(struct mtk_eth *eth)
 {
+   u32 dma_mask = ETHSYS_DMA_AG_MAP_PDMA | ETHSYS_DMA_AG_MAP_QDMA |
+              ETHSYS_DMA_AG_MAP_PPE;
    int i, val, ret;

    if (test_and_set_bit(MTK_HW_INIT, &eth->state))
@@ -2526,6 +2529,10 @@ static int mtk_hw_init(struct mtk_eth *e
    if (ret)
        goto err_disable_pm;

+   if (eth->ethsys)
+       regmap_update_bits(eth->ethsys, ETHSYS_DMA_AG_MAP, dma_mask,
+                  of_dma_is_coherent(eth->dma_dev->of_node) * dma_mask);
+
    if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
        ret = device_reset(eth->dev);
        if (ret) {
@@ -3079,6 +3086,35 @@ free_netdev:
    return err;
 }

+void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev)
+{
+   struct net_device *dev, *tmp;
+   LIST_HEAD(dev_list);
+   int i;
+
+   rtnl_lock();
+
+   for (i = 0; i < MTK_MAC_COUNT; i++) {
+       dev = eth->netdev[i];
+
+       if (!dev || !(dev->flags & IFF_UP))
+           continue;
+
+       list_add_tail(&dev->close_list, &dev_list);
+   }
+
+   dev_close_many(&dev_list, false);
+
+   eth->dma_dev = dma_dev;
+
+   list_for_each_entry_safe(dev, tmp, &dev_list, close_list) {
+       list_del_init(&dev->close_list);
+       dev_open(dev, NULL);
+   }
+
+   rtnl_unlock();
+}
+
 static int mtk_probe(struct platform_device *pdev)
 {
    struct device_node *mac_np;
@@ -3092,6 +3128,7 @@ static int mtk_probe(struct platform_dev
    eth->soc = of_device_get_match_data(&pdev->dev);

    eth->dev = &pdev->dev;
+   eth->dma_dev = &pdev->dev;
    eth->base = devm_platform_ioremap_resource(pdev, 0);
    if (IS_ERR(eth->base))
        return PTR_ERR(eth->base);
@@ -3140,6 +3177,16 @@ static int mtk_probe(struct platform_dev
        }
    }

+   if (of_dma_is_coherent(pdev->dev.of_node)) {
+       struct regmap *cci;
+
+       cci = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+                             "mediatek,cci-control");
+       /* enable CPU/bus coherency */
+       if (!IS_ERR(cci))
+           regmap_write(cci, 0, 3);
+   }
+
    if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) {
        eth->sgmii = devm_kzalloc(eth->dev, sizeof(*eth->sgmii),
                      GFP_KERNEL);
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -463,6 +463,12 @@
 #define RSTCTRL_FE     BIT(6)
 #define RSTCTRL_PPE    BIT(31)

+/* ethernet dma channel agent map */
+#define ETHSYS_DMA_AG_MAP  0x408
+#define ETHSYS_DMA_AG_MAP_PDMA BIT(0)
+#define ETHSYS_DMA_AG_MAP_QDMA BIT(1)
+#define ETHSYS_DMA_AG_MAP_PPE  BIT(2)
+
 /* SGMII subsystem config registers */
 /* Register to auto-negotiation restart */
 #define SGMSYS_PCS_CONTROL_1   0x0
@@ -880,6 +886,7 @@ struct mtk_sgmii {
 /* struct mtk_eth - This is the main datasructure for holding the state
  *         of the driver
  * @dev:       The device pointer
+ * @dma_dev:       The device pointer used for dma mapping/alloc
  * @base:      The mapped register i/o base
  * @page_lock:     Make sure that register operations are atomic
  * @tx_irq_lock:   Make sure that IRQ register operations are atomic
@@ -923,6 +930,7 @@ struct mtk_sgmii {

 struct mtk_eth {
    struct device           *dev;
+   struct device           *dma_dev;
    void __iomem            *base;
    spinlock_t          page_lock;
    spinlock_t          tx_irq_lock;
@@ -1021,6 +1029,7 @@ int mtk_gmac_rgmii_path_setup(struct mtk
 int mtk_eth_offload_init(struct mtk_eth *eth);
 int mtk_eth_setup_tc(struct net_device *dev, enum tc_setup_type type,
             void *type_data);
+void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev);


 #endif /* MTK_ETH_H */
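
mtk_eth_set_dma_device() is the runtime hook this patch prepares for WED: it closes every open netdev, swaps eth->dma_dev (the device all of the dma_alloc_coherent()/dma_map_single() calls above now go through), then reopens the netdevs so the rings are freed and reallocated against the new device. A minimal sketch of a caller follows; the example_wed_* names are invented here, not the actual mtk_wed API:

/* Illustrative sketch only: redirecting DMA to another device at runtime. */
#include <linux/device.h>
#include "mtk_eth_soc.h"

static void example_wed_attach(struct mtk_eth *eth, struct device *wed_dev)
{
    /* All rings and buffers are rebuilt on wed_dev, because
     * mtk_eth_set_dma_device() cycles the netdevs across the swap.
     */
    mtk_eth_set_dma_device(eth, wed_dev);
}

static void example_wed_detach(struct mtk_eth *eth)
{
    /* Fall back to the platform device set in mtk_probe() */
    mtk_eth_set_dma_device(eth, eth->dev);
}

The design choice here is to bounce the interfaces rather than remap live buffers: dev_close_many() and dev_open() already tear down and rebuild all DMA state, so no separate migration path is needed.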
@ -0,0 +1,30 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 7 Feb 2022 10:27:22 +0100
Subject: [PATCH] arm64: dts: mediatek: mt7622: add support for coherent
 DMA

It improves performance by eliminating the need for a cache flush on rx and tx.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

--- a/arch/arm64/boot/dts/mediatek/mt7622.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt7622.dtsi
@@ -357,7 +357,7 @@
        };

        cci_control2: slave-if@5000 {
-           compatible = "arm,cci-400-ctrl-if";
+           compatible = "arm,cci-400-ctrl-if", "syscon";
            interface-type = "ace";
            reg = <0x5000 0x1000>;
        };
@@ -938,6 +938,8 @@
        power-domains = <&scpsys MT7622_POWER_DOMAIN_ETHSYS>;
        mediatek,ethsys = <&ethsys>;
        mediatek,sgmiisys = <&sgmiisys>;
+       mediatek,cci-control = <&cci_control2>;
+       dma-coherent;
        #address-cells = <1>;
        #size-cells = <0>;
        status = "disabled";
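
These two properties are what the mtk_probe() hunk earlier in this commit consumes: "dma-coherent" makes of_dma_is_coherent() return true, and "mediatek,cci-control" points at the CCI-400 slave interface serving the ethernet, which the dtsi change also marks "syscon" so it can be looked up as a regmap. A condensed sketch of that consuming side (the example_ name is invented; the write of 3 to offset 0 targets what I read as the CCI-400 Snoop Control Register, bit 0 enabling snoops and bit 1 enabling DVM messages — an assumption from the CCI-400 documentation, not spelled out in the patch):

/* Illustrative sketch only: driver-side use of the properties above. */
#include <linux/of_address.h>
#include <linux/mfd/syscon.h>
#include <linux/regmap.h>
#include <linux/platform_device.h>

static void example_enable_bus_coherency(struct platform_device *pdev)
{
    struct regmap *cci;

    /* true once the board/SoC node carries "dma-coherent" */
    if (!of_dma_is_coherent(pdev->dev.of_node))
        return;

    /* works because cci_control2 is now also a "syscon" node */
    cci = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
                          "mediatek,cci-control");
    if (!IS_ERR(cci))
        regmap_write(cci, 0, 3);    /* enable snoops + DVM messages */
}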
File diff suppressed because it is too large.
Some files were not shown because too many files have changed in this diff.