mirror of https://github.com/openwrt/openwrt.git (synced 2024-12-19 05:38:00 +00:00)

generic: copy backport, hack, pending patch and config from 5.15 to 6.1

Copy backport, hack, pending patch and config from 5.15 to 6.1.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>

This commit is contained in:
parent 8fb9bbcf65
commit fa79baf4a6
@@ -0,0 +1,73 @@
From 2fd7e7f9317d3048a14026816d081b08ba98ea8e Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 8 Mar 2022 22:56:13 +0100
Subject: [PATCH 1/3] Kbuild: use -Wdeclaration-after-statement

The kernel is moving from using `-std=gnu89` to `-std=gnu11`, permitting
the use of additional C11 features such as for-loop initial declarations.

One contentious aspect of C99 is that it permits mixed declarations and
code, and for now at least, it seems preferable to enforce that
declarations must come first.

These warnings were already enabled in the kernel itself, but not
for KBUILD_USERCFLAGS or the compat VDSO on arch/arm64, which uses
a separate set of CFLAGS.

This patch fixes an existing violation in modpost.c, which is not
reported because of the missing flag in KBUILD_USERCFLAGS:

| scripts/mod/modpost.c: In function ‘match’:
| scripts/mod/modpost.c:837:3: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-statement]
| 837 | const char *endp = p + strlen(p) - 1;
| | ^~~~~

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
[arnd: don't add a duplicate flag to the default set, update changelog]
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com> # LLVM/Clang v13.0.0 (x86-64)
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
Makefile | 3 ++-
arch/arm64/kernel/vdso32/Makefile | 1 +
scripts/mod/modpost.c | 4 +++-
3 files changed, 6 insertions(+), 2 deletions(-)

--- a/Makefile
+++ b/Makefile
@@ -440,7 +440,8 @@ endif
HOSTPKG_CONFIG = pkg-config

export KBUILD_USERCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
- -O2 -fomit-frame-pointer -std=gnu89
+ -O2 -fomit-frame-pointer -std=gnu89 \
+ -Wdeclaration-after-statement
export KBUILD_USERLDFLAGS :=

KBUILD_HOSTCFLAGS := $(KBUILD_USERCFLAGS) $(HOST_LFS_CFLAGS) $(HOSTCFLAGS)
--- a/arch/arm64/kernel/vdso32/Makefile
+++ b/arch/arm64/kernel/vdso32/Makefile
@@ -76,6 +76,7 @@ VDSO_CFLAGS += -Wall -Wundef -Wstrict-pr
-fno-strict-aliasing -fno-common \
-Werror-implicit-function-declaration \
-Wno-format-security \
+ -Wdeclaration-after-statement \
-std=gnu89
VDSO_CFLAGS += -O2
# Some useful compiler-dependent flags from top-level Makefile
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -833,8 +833,10 @@ static int match(const char *sym, const
{
const char *p;
while (*pat) {
+ const char *endp;
+
p = *pat++;
- const char *endp = p + strlen(p) - 1;
+ endp = p + strlen(p) - 1;

/* "*foo*" */
if (*p == '*' && *endp == '*') {
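(Editorial note: a minimal sketch, not part of the commit, of the pattern -Wdeclaration-after-statement rejects; the function mirrors the modpost.c fix above.)

    /* demo.c: compile with gcc -c -Wdeclaration-after-statement demo.c */
    #include <string.h>

    static const char *last_char(const char *p)
    {
            if (!*p)                               /* first statement of the block */
                    return p;
            const char *endp = p + strlen(p) - 1;  /* warns: declaration after statement */
            return endp;
    }

Hoisting the declaration of endp to the top of the block, exactly as the modpost.c hunk does, silences the warning.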
@@ -0,0 +1,60 @@
From b810c8e719ea082e47c7a8f7cf878bc84fa2455d Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 8 Mar 2022 22:56:14 +0100
Subject: [PATCH 2/3] Kbuild: move to -std=gnu11

During a patch discussion, Linus brought up the option of changing
the C standard version from gnu89 to gnu99, which allows using variable
declaration inside of a for() loop. While the C99, C11 and later standards
introduce many other features, most of these are already available in
gnu89 as GNU extensions as well.

An earlier attempt to do this when gcc-5 started defaulting to
-std=gnu11 failed because at the time that caused warnings about
designated initializers with older compilers. Now that gcc-5.1 is
the minimum compiler version used for building kernels, that is no
longer a concern. Similarly, the behavior of 'inline' functions changes
between gnu89 using gnu_inline behavior and gnu11 using standard c99+
behavior, but this was taken care of by defining 'inline' to include
__attribute__((gnu_inline)) in order to allow building with clang a
while ago.

Nathan Chancellor reported a new -Wdeclaration-after-statement
warning that appears in a system header on arm; this still needs a
workaround.

The differences between gnu99, gnu11, gnu1x and gnu17 are fairly
minimal and mainly impact warnings at the -Wpedantic level that the
kernel never enables. Between these, gnu11 is the newest version
that is supported by all supported compiler versions, though it is
only the default on gcc-5, while all other supported versions of
gcc or clang default to gnu1x/gnu17.

Link: https://lore.kernel.org/lkml/CAHk-=wiyCH7xeHcmiFJ-YgXUy2Jaj7pnkdKpcovt8fYbVFW3TA@mail.gmail.com/
Link: https://github.com/ClangBuiltLinux/linux/issues/1603
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Marco Elver <elver@google.com>
Acked-by: Jani Nikula <jani.nikula@intel.com>
Acked-by: David Sterba <dsterba@suse.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Reviewed-by: Alex Shi <alexs@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Reviewed-by: Miguel Ojeda <ojeda@kernel.org>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- a/Makefile
+++ b/Makefile
@@ -524,7 +524,7 @@ KBUILD_CFLAGS := -Wall -Wundef -Werror
-fno-strict-aliasing -fno-common -fshort-wchar -fno-PIE \
-Werror=implicit-function-declaration -Werror=implicit-int \
-Werror=return-type -Wno-format-security \
- -std=gnu89
+ -std=gnu11
KBUILD_CPPFLAGS := -D__KERNEL__
KBUILD_AFLAGS_KERNEL :=
KBUILD_CFLAGS_KERNEL :=
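(Editorial note: a minimal sketch, not from the patch, of the main convenience gnu11 adds over gnu89, the C99-style loop-scoped declaration:)

    /* builds with gcc -std=gnu11 -c; gnu89 rejects the loop declaration */
    static int sum(const int *v, int n)
    {
            int total = 0;

            for (int i = 0; i < n; i++)     /* 'i' is scoped to the loop */
                    total += v[i];
            return total;
    }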
@@ -0,0 +1,43 @@
From 40337d6f3d677aee7ad3052ae662d3f53dd4d5cb Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 8 Mar 2022 22:56:15 +0100
Subject: [PATCH 3/3] Kbuild: use -std=gnu11 for KBUILD_USERCFLAGS

As we change the C language standard for the kernel from gnu89 to
gnu11, it makes sense to also update the version for user space
compilation.

Some users have older native compilers than what they use for
kernel builds, so I considered using gnu99 as the default version
for wider compatibility with gcc-4.6 and earlier.

However, testing with older compilers showed that we already require
HOSTCC version 5.1 as well because a lot of host tools include
linux/compiler.h that uses __has_attribute():

CC tools/objtool/exec-cmd.o
In file included from tools/include/linux/compiler_types.h:36:0,
from tools/include/linux/compiler.h:5,
from exec-cmd.c:2:
tools/include/linux/compiler-gcc.h:19:5: error: "__has_attribute" is not defined [-Werror=undef]

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Reviewed-by: Nathan Chancellor <nathan@kernel.org>
Reviewed-by: Nick Desaulniers <ndesaulniers@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
---
Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- a/Makefile
+++ b/Makefile
@@ -440,7 +440,7 @@ endif
HOSTPKG_CONFIG = pkg-config

export KBUILD_USERCFLAGS := -Wall -Wmissing-prototypes -Wstrict-prototypes \
- -O2 -fomit-frame-pointer -std=gnu89 \
+ -O2 -fomit-frame-pointer -std=gnu11 \
-Wdeclaration-after-statement
export KBUILD_USERLDFLAGS :=
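(Editorial note: __has_attribute() is a preprocessor feature-test operator that old host compilers lack; under -Werror=undef they error out on it exactly as quoted above. A hedged sketch of the usual guard pattern; the fallback macro below is illustrative, not the kernel's exact code:)

    /* feature-test an attribute; without the #ifndef guard, a pre-5.x gcc
     * errors out under -Werror=undef, as in the build log quoted above */
    #ifndef __has_attribute
    # define __has_attribute(x) 0   /* assume nothing is supported */
    #endif

    #if __has_attribute(__noreturn__)
    # define my_noreturn __attribute__((__noreturn__))
    #else
    # define my_noreturn
    #endif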
@@ -0,0 +1,425 @@
From a4103262b01a1b8704b37c01c7c813df91b7b119 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 01:59:58 -0600
Subject: [PATCH 01/29] mm: x86, arm64: add arch_has_hw_pte_young()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patch series "Multi-Gen LRU Framework", v14.

What's new
==========
1. OpenWrt, in addition to Android, Arch Linux Zen, Armbian, ChromeOS,
Liquorix, post-factum and XanMod, is now shipping MGLRU on 5.15.
2. Fixed long-tailed direct reclaim latency seen on high-memory (TBs)
machines. The old direct reclaim backoff, which tries to enforce a
minimum fairness among all eligible memcgs, over-swapped by about
(total_mem>>DEF_PRIORITY)-nr_to_reclaim. The new backoff, which
pulls the plug on swapping once the target is met, trades some
fairness for curtailed latency:
https://lore.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com/
3. Fixed minor build warnings and conflicts. More comments and nits.

TLDR
====
The current page reclaim is too expensive in terms of CPU usage and it
often makes poor choices about what to evict. This patchset offers an
alternative solution that is performant, versatile and
straightforward.

Patchset overview
=================
The design and implementation overview is in patch 14:
https://lore.kernel.org/r/20220918080010.2920238-15-yuzhao@google.com/

01. mm: x86, arm64: add arch_has_hw_pte_young()
02. mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
Take advantage of hardware features when trying to clear the accessed
bit in many PTEs.

03. mm/vmscan.c: refactor shrink_node()
04. Revert "include/linux/mm_inline.h: fold __update_lru_size() into
its sole caller"
Minor refactors to improve readability for the following patches.

05. mm: multi-gen LRU: groundwork
Adds the basic data structure and the functions that insert pages to
and remove pages from the multi-gen LRU (MGLRU) lists.

06. mm: multi-gen LRU: minimal implementation
A minimal implementation without optimizations.

07. mm: multi-gen LRU: exploit locality in rmap
Exploits spatial locality to improve efficiency when using the rmap.

08. mm: multi-gen LRU: support page table walks
Further exploits spatial locality by optionally scanning page tables.

09. mm: multi-gen LRU: optimize multiple memcgs
Optimizes the overall performance for multiple memcgs running mixed
types of workloads.

10. mm: multi-gen LRU: kill switch
Adds a kill switch to enable or disable MGLRU at runtime.

11. mm: multi-gen LRU: thrashing prevention
12. mm: multi-gen LRU: debugfs interface
Provide userspace with features like thrashing prevention, working set
estimation and proactive reclaim.

13. mm: multi-gen LRU: admin guide
14. mm: multi-gen LRU: design doc
Add an admin guide and a design doc.

Benchmark results
=================
Independent lab results
-----------------------
Based on the popularity of searches [01] and the memory usage in
Google's public cloud, the most popular open-source memory-hungry
applications, in alphabetical order, are:
Apache Cassandra Memcached
Apache Hadoop MongoDB
Apache Spark PostgreSQL
MariaDB (MySQL) Redis

An independent lab evaluated MGLRU with the most widely used benchmark
suites for the above applications. They posted 960 data points along
with kernel metrics and perf profiles collected over more than 500
hours of total benchmark time. Their final reports show that, with 95%
confidence intervals (CIs), the above applications all performed
significantly better for at least part of their benchmark matrices.

On 5.14:
1. Apache Spark [02] took 95% CIs [9.28, 11.19]% and [12.20, 14.93]%
less wall time to sort three billion random integers, respectively,
under the medium- and the high-concurrency conditions, when
overcommitting memory. There were no statistically significant
changes in wall time for the rest of the benchmark matrix.
2. MariaDB [03] achieved 95% CIs [5.24, 10.71]% and [20.22, 25.97]%
more transactions per minute (TPM), respectively, under the medium-
and the high-concurrency conditions, when overcommitting memory.
There were no statistically significant changes in TPM for the rest
of the benchmark matrix.
3. Memcached [04] achieved 95% CIs [23.54, 32.25]%, [20.76, 41.61]%
and [21.59, 30.02]% more operations per second (OPS), respectively,
for sequential access, random access and Gaussian (distribution)
access, when THP=always; 95% CIs [13.85, 15.97]% and
[23.94, 29.92]% more OPS, respectively, for random access and
Gaussian access, when THP=never. There were no statistically
significant changes in OPS for the rest of the benchmark matrix.
4. MongoDB [05] achieved 95% CIs [2.23, 3.44]%, [6.97, 9.73]% and
[2.16, 3.55]% more operations per second (OPS), respectively, for
exponential (distribution) access, random access and Zipfian
(distribution) access, when underutilizing memory; 95% CIs
[8.83, 10.03]%, [21.12, 23.14]% and [5.53, 6.46]% more OPS,
respectively, for exponential access, random access and Zipfian
access, when overcommitting memory.

On 5.15:
5. Apache Cassandra [06] achieved 95% CIs [1.06, 4.10]%, [1.94, 5.43]%
and [4.11, 7.50]% more operations per second (OPS), respectively,
for exponential (distribution) access, random access and Zipfian
(distribution) access, when swap was off; 95% CIs [0.50, 2.60]%,
[6.51, 8.77]% and [3.29, 6.75]% more OPS, respectively, for
exponential access, random access and Zipfian access, when swap was
on.
6. Apache Hadoop [07] took 95% CIs [5.31, 9.69]% and [2.02, 7.86]%
less average wall time to finish twelve parallel TeraSort jobs,
respectively, under the medium- and the high-concurrency
conditions, when swap was on. There were no statistically
significant changes in average wall time for the rest of the
benchmark matrix.
7. PostgreSQL [08] achieved 95% CI [1.75, 6.42]% more transactions per
minute (TPM) under the high-concurrency condition, when swap was
off; 95% CIs [12.82, 18.69]% and [22.70, 46.86]% more TPM,
respectively, under the medium- and the high-concurrency
conditions, when swap was on. There were no statistically
significant changes in TPM for the rest of the benchmark matrix.
8. Redis [09] achieved 95% CIs [0.58, 5.94]%, [6.55, 14.58]% and
[11.47, 19.36]% more total operations per second (OPS),
respectively, for sequential access, random access and Gaussian
(distribution) access, when THP=always; 95% CIs [1.27, 3.54]%,
[10.11, 14.81]% and [8.75, 13.64]% more total OPS, respectively,
for sequential access, random access and Gaussian access, when
THP=never.

Our lab results
---------------
To supplement the above results, we ran the following benchmark suites
on 5.16-rc7 and found no regressions [10].
fs_fio_bench_hdd_mq pft
fs_lmbench pgsql-hammerdb
fs_parallelio redis
fs_postmark stream
hackbench sysbenchthread
kernbench tpcc_spark
memcached unixbench
multichase vm-scalability
mutilate will-it-scale
nginx

[01] https://trends.google.com
[02] https://lore.kernel.org/r/20211102002002.92051-1-bot@edi.works/
[03] https://lore.kernel.org/r/20211009054315.47073-1-bot@edi.works/
[04] https://lore.kernel.org/r/20211021194103.65648-1-bot@edi.works/
[05] https://lore.kernel.org/r/20211109021346.50266-1-bot@edi.works/
[06] https://lore.kernel.org/r/20211202062806.80365-1-bot@edi.works/
[07] https://lore.kernel.org/r/20211209072416.33606-1-bot@edi.works/
[08] https://lore.kernel.org/r/20211218071041.24077-1-bot@edi.works/
[09] https://lore.kernel.org/r/20211122053248.57311-1-bot@edi.works/
[10] https://lore.kernel.org/r/20220104202247.2903702-1-yuzhao@google.com/

Real-world applications
=======================
Third-party testimonials
------------------------
Konstantin reported [11]:
I have Archlinux with 8G RAM + zswap + swap. While developing, I
have lots of apps opened such as multiple LSP-servers for different
langs, chats, two browsers, etc... Usually, my system gets quickly
to a point of SWAP-storms, where I have to kill LSP-servers,
restart browsers to free memory, etc, otherwise the system lags
heavily and is barely usable.

1.5 days ago I migrated from 5.11.15 kernel to 5.12 + the LRU
patchset, and I started up by opening lots of apps to create memory
pressure, and worked for a day like this. Till now I had not a
single SWAP-storm, and mind you I got 3.4G in SWAP. I was never
getting to the point of 3G in SWAP before without a single
SWAP-storm.

Vaibhav from IBM reported [12]:
In a synthetic MongoDB Benchmark, seeing an average of ~19%
throughput improvement on POWER10 (Radix MMU + 64K Page Size) with
MGLRU patches on top of 5.16 kernel for MongoDB + YCSB across
three different request distributions, namely, Exponential, Uniform
and Zipfian.

Shuang from U of Rochester reported [13]:
With the MGLRU, fio achieved 95% CIs [38.95, 40.26]%, [4.12, 6.64]%
and [9.26, 10.36]% higher throughput, respectively, for random
access, Zipfian (distribution) access and Gaussian (distribution)
access, when the average number of jobs per CPU is 1; 95% CIs
[42.32, 49.15]%, [9.44, 9.89]% and [20.99, 22.86]% higher
throughput, respectively, for random access, Zipfian access and
Gaussian access, when the average number of jobs per CPU is 2.

Daniel from Michigan Tech reported [14]:
With Memcached allocating ~100GB of byte-addressable Optane,
performance improvement in terms of throughput (measured as queries
per second) was about 10% for a series of workloads.

Large-scale deployments
-----------------------
We've rolled out MGLRU to tens of millions of ChromeOS users and
about a million Android users. Google's fleetwide profiling [15] shows
an overall 40% decrease in kswapd CPU usage, in addition to
improvements in other UX metrics, e.g., an 85% decrease in the number
of low-memory kills at the 75th percentile and an 18% decrease in
app launch time at the 50th percentile.

The downstream kernels that have been using MGLRU include:
1. Android [16]
2. Arch Linux Zen [17]
3. Armbian [18]
4. ChromeOS [19]
5. Liquorix [20]
6. OpenWrt [21]
7. post-factum [22]
8. XanMod [23]

[11] https://lore.kernel.org/r/140226722f2032c86301fbd326d91baefe3d7d23.camel@yandex.ru/
[12] https://lore.kernel.org/r/87czj3mux0.fsf@vajain21.in.ibm.com/
[13] https://lore.kernel.org/r/20220105024423.26409-1-szhai2@cs.rochester.edu/
[14] https://lore.kernel.org/r/CA+4-3vksGvKd18FgRinxhqHetBS1hQekJE2gwco8Ja-bJWKtFw@mail.gmail.com/
[15] https://dl.acm.org/doi/10.1145/2749469.2750392
[16] https://android.com
[17] https://archlinux.org
[18] https://armbian.com
[19] https://chromium.org
[20] https://liquorix.net
[21] https://openwrt.org
[22] https://codeberg.org/pf-kernel
[23] https://xanmod.org

Summary
=======
The facts are:
1. The independent lab results and the real-world applications
indicate substantial improvements; there are no known regressions.
2. Thrashing prevention, working set estimation and proactive reclaim
work out of the box; there are no equivalent solutions.
3. There is a lot of new code; no smaller changes have been shown to
achieve similar effects.

Our options, accordingly, are:
1. Given the amount of evidence, the reported improvements will likely
materialize for a wide range of workloads.
2. Gauging the interest from the past discussions, the new features
will likely be put to use for both personal computers and data
centers.
3. Based on Google's track record, the new code will likely be well
maintained in the long term. It'd be more difficult if not
impossible to achieve similar effects with other approaches.

This patch (of 14):

Some architectures automatically set the accessed bit in PTEs, e.g., x86
and arm64 v8.2. On architectures that do not have this capability,
clearing the accessed bit in a PTE usually triggers a page fault following
the TLB miss of this PTE (to emulate the accessed bit).

Being aware of this capability can help make better decisions, e.g.,
whether to spread the work out over a period of time to reduce bursty page
faults when trying to clear the accessed bit in many PTEs.

Note that theoretically this capability can be unreliable, e.g.,
hotplugged CPUs might be different from builtin ones. Therefore it should
not be used in architecture-independent code that involves correctness,
e.g., to determine whether TLB flushes are required (in combination with
the accessed bit).

Link: https://lkml.kernel.org/r/20220918080010.2920238-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/20220918080010.2920238-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Acked-by: Will Deacon <will@kernel.org>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
arch/arm64/include/asm/pgtable.h | 14 ++------------
arch/x86/include/asm/pgtable.h | 6 +++---
include/linux/pgtable.h | 13 +++++++++++++
mm/memory.c | 14 +-------------
4 files changed, 19 insertions(+), 28 deletions(-)

--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -999,23 +999,13 @@ static inline void update_mmu_cache(stru
* page after fork() + CoW for pfn mappings. We don't always have a
* hardware-managed access flag on arm64.
*/
-static inline bool arch_faults_on_old_pte(void)
-{
- WARN_ON(preemptible());
-
- return !cpu_has_hw_af();
-}
-#define arch_faults_on_old_pte arch_faults_on_old_pte
+#define arch_has_hw_pte_young cpu_has_hw_af

/*
* Experimentally, it's cheap to set the access flag in hardware and we
* benefit from prefaulting mappings as 'old' to start with.
*/
-static inline bool arch_wants_old_prefaulted_pte(void)
-{
- return !arch_faults_on_old_pte();
-}
-#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
+#define arch_wants_old_prefaulted_pte cpu_has_hw_af

#endif /* !__ASSEMBLY__ */

--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c
return boot_cpu_has_bug(X86_BUG_L1TF);
}

-#define arch_faults_on_old_pte arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
+#define arch_has_hw_pte_young arch_has_hw_pte_young
+static inline bool arch_has_hw_pte_young(void)
{
- return false;
+ return true;
}

#endif /* __ASSEMBLY__ */
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

+#ifndef arch_has_hw_pte_young
+/*
+ * Return whether the accessed bit is supported on the local CPU.
+ *
+ * This stub assumes accessing through an old PTE triggers a page fault.
+ * Architectures that automatically set the access bit should overwrite it.
+ */
+static inline bool arch_has_hw_pte_young(void)
+{
+ return false;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly =
2;
#endif

-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
- /*
- * Those arches which don't have hw access flag feature need to
- * implement their own helper. By default, "true" means pagefault
- * will be hit on old pte.
- */
- return true;
-}
-#endif
-
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+ if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
pte_t entry;

vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
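(Editorial note: a hedged sketch of the decision the cover text describes; aging_batch() is a hypothetical helper, not from the patch, showing how a page-table walker might size its work around arch_has_hw_pte_young():)

    /* hypothetical: choose how many PTEs to age in one pass */
    static unsigned long aging_batch(void)
    {
            /*
             * With a hardware-managed accessed bit, re-marking a page
             * young is free; without one, every cleared PTE costs a
             * minor fault on the next access, so spread the work out.
             */
            return arch_has_hw_pte_young() ? 4096 : 64;
    }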
@@ -0,0 +1,153 @@
From 493de1c4b0f2cd909169401da8c445f6c8a7e29d Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 01:59:59 -0600
Subject: [PATCH 02/29] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some architectures support the accessed bit in non-leaf PMD entries, e.g.,
x86 sets the accessed bit in a non-leaf PMD entry when using it as part of
linear address translation [1]. Page table walkers that clear the
accessed bit may use this capability to reduce their search space.

Note that:
1. Although an inline function is preferable, this capability is added
as a configuration option for consistency with the existing macros.
2. Due to the little interest in other varieties, this capability was
only tested on Intel and AMD CPUs.

Thanks to the following developers for their efforts [2][3].
Randy Dunlap <rdunlap@infradead.org>
Stephen Rothwell <sfr@canb.auug.org.au>

[1] Intel 64 and IA-32 Architectures Software Developer's Manual
Volume 3 (June 2021), section 4.8
[2] https://lore.kernel.org/r/bfdcc7c8-922f-61a9-aa15-7e7250f04af7@infradead.org/
[3] https://lore.kernel.org/r/20220413151513.5a0d7a7e@canb.auug.org.au/

Link: https://lkml.kernel.org/r/20220918080010.2920238-3-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
arch/Kconfig | 8 ++++++++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/pgtable.h | 3 ++-
arch/x86/mm/pgtable.c | 5 ++++-
include/linux/pgtable.h | 4 ++--
5 files changed, 17 insertions(+), 4 deletions(-)

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1295,6 +1295,14 @@ config ARCH_HAS_ELFCORE_COMPAT
config ARCH_HAS_PARANOID_L1D_FLUSH
bool

+config ARCH_HAS_NONLEAF_PMD_YOUNG
+ bool
+ help
+ Architectures that select this option are capable of setting the
+ accessed bit in non-leaf PMD entries when using them as part of linear
+ address translations. Page table walkers that clear the accessed bit
+ may use this capability to reduce their search space.
+
source "kernel/gcov/Kconfig"

source "scripts/gcc-plugins/Kconfig"
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -84,6 +84,7 @@ config X86
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_NONLEAF_PMD_YOUNG if PGTABLE_LEVELS > 2
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_COPY_MC if X86_64
select ARCH_HAS_SET_MEMORY
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad

static inline int pmd_bad(pmd_t pmd)
{
- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
+ (_KERNPG_TABLE & ~_PAGE_ACCESSED);
}

static inline unsigned long pages_to_mb(unsigned long npg)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_
return ret;
}

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_

return ret;
}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp)
{
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo
#endif

#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo
BUILD_BUG();
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
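(Editorial note: the "reduce their search space" point amounts to pruning. If a non-leaf PMD entry's accessed bit is clear, no PTE beneath it has been used since the bit was last cleared, so a walker can skip all of them. A hedged sketch built on the pmdp_test_and_clear_young() shown in the diff; the wrapper itself is hypothetical:)

    /* hypothetical: decide whether the PTEs under this PMD merit a scan */
    static bool pmd_worth_scanning(struct vm_area_struct *vma,
                                   unsigned long addr, pmd_t *pmd)
    {
    #ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
            /* clearing also rearms the bit for the next aging pass */
            if (!pmdp_test_and_clear_young(vma, addr, pmd))
                    return false;   /* nothing under this PMD was accessed */
    #endif
            return true;            /* no capability: the PTEs must be scanned */
    }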
@@ -0,0 +1,275 @@
From 9e17efd11450d3d2069adaa3c58db9ac8ebd1c66 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:00 -0600
Subject: [PATCH 03/29] mm/vmscan.c: refactor shrink_node()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch refactors shrink_node() to improve readability for the upcoming
changes to mm/vmscan.c.

Link: https://lkml.kernel.org/r/20220918080010.2920238-4-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Barry Song <baohua@kernel.org>
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 198 +++++++++++++++++++++++++++-------------------------
1 file changed, 104 insertions(+), 94 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2497,6 +2497,109 @@ enum scan_balance {
SCAN_FILE,
};

+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+ * Flush the memory cgroup stats, so that we read accurate per-memcg
+ * lruvec stats for heuristics.
+ */
+ mem_cgroup_flush_stats();
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&target_lruvec->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&target_lruvec->lru_lock);
+
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
+
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[0] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[1] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
+
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+
+ if (!managed_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
@@ -2965,109 +3068,16 @@ static void shrink_node(pg_data_t *pgdat
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
- unsigned long file;

target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
- /*
- * Flush the memory cgroup stats, so that we read accurate per-memcg
- * lruvec stats for heuristics.
- */
- mem_cgroup_flush_stats();
-
memset(&sc->nr, 0, sizeof(sc->nr));

nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;

- /*
- * Determine the scan balance between anon and file LRUs.
- */
- spin_lock_irq(&target_lruvec->lru_lock);
- sc->anon_cost = target_lruvec->anon_cost;
- sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&target_lruvec->lru_lock);
-
- /*
- * Target desirable inactive:active list ratios for the anon
- * and file LRU lists.
- */
- if (!sc->force_deactivate) {
- unsigned long refaults;
-
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_ANON);
- if (refaults != target_lruvec->refaults[0] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
- sc->may_deactivate |= DEACTIVATE_ANON;
- else
- sc->may_deactivate &= ~DEACTIVATE_ANON;
-
- /*
- * When refaults are being observed, it means a new
- * workingset is being established. Deactivate to get
- * rid of any stale active pages quickly.
- */
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_FILE);
- if (refaults != target_lruvec->refaults[1] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
- sc->may_deactivate |= DEACTIVATE_FILE;
- else
- sc->may_deactivate &= ~DEACTIVATE_FILE;
- } else
- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
- /*
- * If we have plenty of inactive file pages that aren't
- * thrashing, try to reclaim those first before touching
- * anonymous pages.
- */
- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
- sc->cache_trim_mode = 1;
- else
- sc->cache_trim_mode = 0;
-
- /*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
- */
- if (!cgroup_reclaim(sc)) {
- unsigned long total_high_wmark = 0;
- unsigned long free, anon;
- int z;
-
- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
-
- total_high_wmark += high_wmark_pages(zone);
- }
-
- /*
- * Consider anon: if that's low too, this isn't a
- * runaway file reclaim problem, but rather just
- * extreme pressure. Reclaim as per usual then.
- */
- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
- sc->file_is_tiny =
- file + free <= total_high_wmark &&
- !(sc->may_deactivate & DEACTIVATE_ANON) &&
- anon >> sc->priority;
- }
+ prepare_scan_count(pgdat, sc);

shrink_node_memcgs(pgdat, sc);
@ -0,0 +1,82 @@
|
||||
From 03705be42114db7cc5bd6eb7bf7e8703c94d4880 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Sun, 18 Sep 2022 02:00:01 -0600
|
||||
Subject: [PATCH 04/29] Revert "include/linux/mm_inline.h: fold
|
||||
__update_lru_size() into its sole caller"
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
This patch undoes the following refactor: commit 289ccba18af4
|
||||
("include/linux/mm_inline.h: fold __update_lru_size() into its sole
|
||||
caller")
|
||||
|
||||
The upcoming changes to include/linux/mm_inline.h will reuse
|
||||
__update_lru_size().
|
||||
|
||||
Link: https://lkml.kernel.org/r/20220918080010.2920238-5-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
|
||||
Acked-by: Brian Geffon <bgeffon@google.com>
|
||||
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
||||
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
||||
Acked-by: Steven Barrett <steven@liquorix.net>
|
||||
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
||||
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
||||
Tested-by: Donald Carr <d@chaos-reins.com>
|
||||
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
||||
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
||||
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
||||
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
||||
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
||||
Cc: Andi Kleen <ak@linux.intel.com>
|
||||
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
||||
Cc: Barry Song <baohua@kernel.org>
|
||||
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
||||
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
||||
Cc: Hillf Danton <hdanton@sina.com>
|
||||
Cc: Jens Axboe <axboe@kernel.dk>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
||||
Cc: Matthew Wilcox <willy@infradead.org>
|
||||
Cc: Mel Gorman <mgorman@suse.de>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
||||
Cc: Peter Zijlstra <peterz@infradead.org>
|
||||
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vlastimil Babka <vbabka@suse.cz>
|
||||
Cc: Will Deacon <will@kernel.org>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
include/linux/mm_inline.h | 9 ++++++++-
|
||||
1 file changed, 8 insertions(+), 1 deletion(-)
|
||||
|
||||
--- a/include/linux/mm_inline.h
|
||||
+++ b/include/linux/mm_inline.h
|
||||
@@ -24,7 +24,7 @@ static inline int page_is_file_lru(struc
|
||||
return !PageSwapBacked(page);
|
||||
}
|
||||
|
||||
-static __always_inline void update_lru_size(struct lruvec *lruvec,
|
||||
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
|
||||
enum lru_list lru, enum zone_type zid,
|
||||
int nr_pages)
|
||||
{
|
||||
@@ -33,6 +33,13 @@ static __always_inline void update_lru_s
|
||||
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
|
||||
__mod_zone_page_state(&pgdat->node_zones[zid],
|
||||
NR_ZONE_LRU_BASE + lru, nr_pages);
|
||||
+}
|
||||
+
|
||||
+static __always_inline void update_lru_size(struct lruvec *lruvec,
|
||||
+ enum lru_list lru, enum zone_type zid,
|
||||
+ long nr_pages)
|
||||
+{
|
||||
+ __update_lru_size(lruvec, lru, zid, nr_pages);
|
||||
#ifdef CONFIG_MEMCG
|
||||
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
|
||||
#endif
|
@ -0,0 +1,807 @@
|
||||
From a9b328add8422921a0dbbef162730800e16e8cfd Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Sun, 18 Sep 2022 02:00:02 -0600
|
||||
Subject: [PATCH 05/29] mm: multi-gen LRU: groundwork
|
||||
MIME-Version: 1.0
|
||||
Content-Type: text/plain; charset=UTF-8
|
||||
Content-Transfer-Encoding: 8bit
|
||||
|
||||
Evictable pages are divided into multiple generations for each lruvec.
|
||||
The youngest generation number is stored in lrugen->max_seq for both
|
||||
anon and file types as they are aged on an equal footing. The oldest
|
||||
generation numbers are stored in lrugen->min_seq[] separately for anon
|
||||
and file types as clean file pages can be evicted regardless of swap
|
||||
constraints. These three variables are monotonically increasing.
|
||||
|
||||
Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
|
||||
in order to fit into the gen counter in page->flags. Each truncated
|
||||
generation number is an index to lrugen->lists[]. The sliding window
|
||||
technique is used to track at least MIN_NR_GENS and at most
|
||||
MAX_NR_GENS generations. The gen counter stores a value within [1,
|
||||
MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
|
||||
stores 0.
|
||||
|
||||
There are two conceptually independent procedures: "the aging", which
|
||||
produces young generations, and "the eviction", which consumes old
|
||||
generations. They form a closed-loop system, i.e., "the page reclaim".
|
||||
Both procedures can be invoked from userspace for the purposes of working
|
||||
set estimation and proactive reclaim. These techniques are commonly used
|
||||
to optimize job scheduling (bin packing) in data centers [1][2].
|
||||
|
||||
To avoid confusion, the terms "hot" and "cold" will be applied to the
|
||||
multi-gen LRU, as a new convention; the terms "active" and "inactive" will
|
||||
be applied to the active/inactive LRU, as usual.
|
||||
|
||||
The protection of hot pages and the selection of cold pages are based
|
||||
on page access channels and patterns. There are two access channels:
|
||||
one through page tables and the other through file descriptors. The
|
||||
protection of the former channel is by design stronger because:
|
||||
1. The uncertainty in determining the access patterns of the former
|
||||
channel is higher due to the approximation of the accessed bit.
|
||||
2. The cost of evicting the former channel is higher due to the TLB
|
||||
flushes required and the likelihood of encountering the dirty bit.
|
||||
3. The penalty of underprotecting the former channel is higher because
|
||||
applications usually do not prepare themselves for major page
|
||||
faults like they do for blocked I/O. E.g., GUI applications
|
||||
commonly use dedicated I/O threads to avoid blocking rendering
|
||||
threads.
|
||||
|
||||
There are also two access patterns: one with temporal locality and the
|
||||
other without. For the reasons listed above, the former channel is
|
||||
assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is
|
||||
present; the latter channel is assumed to follow the latter pattern unless
|
||||
outlying refaults have been observed [3][4].
|
||||
|
||||
The next patch will address the "outlying refaults". Three macros, i.e.,
|
||||
LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in
|
||||
this patch to make the entire patchset less diffy.
|
||||
|
||||
A page is added to the youngest generation on faulting. The aging needs
|
||||
to check the accessed bit at least twice before handing this page over to
|
||||
the eviction. The first check takes care of the accessed bit set on the
|
||||
initial fault; the second check makes sure this page has not been used
|
||||
since then. This protocol, AKA second chance, requires a minimum of two
|
||||
generations, hence MIN_NR_GENS.
|
||||
|
||||
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
|
||||
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
|
||||
[3] https://lwn.net/Articles/495543/
|
||||
[4] https://lwn.net/Articles/815342/
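(Editorial note: a sketch of the generation arithmetic described above, assuming MIN_NR_GENS == 2; lru_gen_from_seq() in the diff below performs the same truncation:)

    /* seq is monotonic; only its low bits fit in page->flags */
    static int gen_of(unsigned long seq)
    {
            return seq % MAX_NR_GENS;       /* index into lrugen->lists[] */
    }

    /* the two youngest generations play the role of the active list */
    static bool seq_is_hot(unsigned long seq, unsigned long max_seq)
    {
            return seq + MIN_NR_GENS > max_seq;
    }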

Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/fuse/dev.c | 3 +-
include/linux/mm.h | 2 +
include/linux/mm_inline.h | 177 +++++++++++++++++++++++++++++-
include/linux/mmzone.h | 100 +++++++++++++++++
include/linux/page-flags-layout.h | 13 ++-
include/linux/page-flags.h | 4 +-
include/linux/sched.h | 4 +
kernel/bounds.c | 5 +
mm/Kconfig | 8 ++
mm/huge_memory.c | 3 +-
mm/memcontrol.c | 2 +
mm/memory.c | 25 +++++
mm/mm_init.c | 6 +-
mm/mmzone.c | 2 +
mm/swap.c | 10 +-
mm/vmscan.c | 75 +++++++++++++
16 files changed, 425 insertions(+), 14 deletions(-)

--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
1 << PG_active |
1 << PG_workingset |
1 << PG_reclaim |
- 1 << PG_waiters))) {
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)

/*
* Define the bit shifts to access each section. For non-existent
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -26,10 +26,13 @@ static inline int page_is_file_lru(struc

static __always_inline void __update_lru_size(struct lruvec *lruvec,
enum lru_list lru, enum zone_type zid,
- int nr_pages)
+ long nr_pages)
{
struct pglist_data *pgdat = lruvec_pgdat(lruvec);

+ lockdep_assert_held(&lruvec->lru_lock);
+ WARN_ON_ONCE(nr_pages != (int)nr_pages);
+
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
@@ -86,11 +89,177 @@ static __always_inline enum lru_list pag
return lru;
}

+#ifdef CONFIG_LRU_GEN
+
+static inline bool lru_gen_enabled(void)
+{
+ return true;
+}
+
+static inline bool lru_gen_in_fault(void)
+{
+ return current->in_lru_fault;
+}
+
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+ return seq % MAX_NR_GENS;
+}
+
+static inline int page_lru_gen(struct page *page)
+{
+ unsigned long flags = READ_ONCE(page->flags);
+
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+ unsigned long max_seq = lruvec->lrugen.max_seq;
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+
+ /* see the comment on MIN_NR_GENS */
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
+ int old_gen, int new_gen)
+{
+ int type = page_is_file_lru(page);
+ int zone = page_zonenum(page);
+ int delta = thp_nr_pages(page);
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
+
+ if (old_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
+ lrugen->nr_pages[old_gen][type][zone] - delta);
+ if (new_gen >= 0)
+ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
+ lrugen->nr_pages[new_gen][type][zone] + delta);
+
+ /* addition */
+ if (old_gen < 0) {
+ if (lru_gen_is_active(lruvec, new_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, delta);
+ return;
+ }
+
+ /* deletion */
+ if (new_gen < 0) {
+ if (lru_gen_is_active(lruvec, old_gen))
+ lru += LRU_ACTIVE;
+ __update_lru_size(lruvec, lru, zone, -delta);
+ return;
+ }
+}
+
+static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
+{
+ unsigned long seq;
+ unsigned long flags;
|
||||
+ int gen = page_lru_gen(page);
|
||||
+ int type = page_is_file_lru(page);
|
||||
+ int zone = page_zonenum(page);
|
||||
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(gen != -1, page);
|
||||
+
|
||||
+ if (PageUnevictable(page))
|
||||
+ return false;
|
||||
+ /*
|
||||
+ * There are three common cases for this page:
|
||||
+ * 1. If it's hot, e.g., freshly faulted in or previously hot and
|
||||
+ * migrated, add it to the youngest generation.
|
||||
+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page
|
||||
+ * not in swapcache or a dirty page pending writeback, add it to the
|
||||
+ * second oldest generation.
|
||||
+ * 3. Everything else (clean, cold) is added to the oldest generation.
|
||||
+ */
|
||||
+ if (PageActive(page))
|
||||
+ seq = lrugen->max_seq;
|
||||
+ else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) ||
|
||||
+ (PageReclaim(page) &&
|
||||
+ (PageDirty(page) || PageWriteback(page))))
|
||||
+ seq = lrugen->min_seq[type] + 1;
|
||||
+ else
|
||||
+ seq = lrugen->min_seq[type];
|
||||
+
|
||||
+ gen = lru_gen_from_seq(seq);
|
||||
+ flags = (gen + 1UL) << LRU_GEN_PGOFF;
|
||||
+ /* see the comment on MIN_NR_GENS about PG_active */
|
||||
+ set_mask_bits(&page->flags, LRU_GEN_MASK | BIT(PG_active), flags);
|
||||
+
|
||||
+ lru_gen_update_size(lruvec, page, -1, gen);
|
||||
+ /* for rotate_reclaimable_page() */
|
||||
+ if (reclaiming)
|
||||
+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+ else
|
||||
+ list_add(&page->lru, &lrugen->lists[gen][type][zone]);
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
||||
+{
|
||||
+ unsigned long flags;
|
||||
+ int gen = page_lru_gen(page);
|
||||
+
|
||||
+ if (gen < 0)
|
||||
+ return false;
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
||||
+
|
||||
+ /* for migrate_page_states() */
|
||||
+ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
|
||||
+ flags = set_mask_bits(&page->flags, LRU_GEN_MASK, flags);
|
||||
+ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
||||
+
|
||||
+ lru_gen_update_size(lruvec, page, gen, -1);
|
||||
+ list_del(&page->lru);
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+#else /* !CONFIG_LRU_GEN */
|
||||
+
|
||||
+static inline bool lru_gen_enabled(void)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_in_fault(void)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
||||
+{
|
||||
+ return false;
|
||||
+}
|
||||
+
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
static __always_inline void add_page_to_lru_list(struct page *page,
|
||||
struct lruvec *lruvec)
|
||||
{
|
||||
enum lru_list lru = page_lru(page);
|
||||
|
||||
+ if (lru_gen_add_page(lruvec, page, false))
|
||||
+ return;
|
||||
+
|
||||
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
||||
list_add(&page->lru, &lruvec->lists[lru]);
|
||||
}
|
||||
@@ -100,6 +269,9 @@ static __always_inline void add_page_to_
|
||||
{
|
||||
enum lru_list lru = page_lru(page);
|
||||
|
||||
+ if (lru_gen_add_page(lruvec, page, true))
|
||||
+ return;
|
||||
+
|
||||
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
||||
list_add_tail(&page->lru, &lruvec->lists[lru]);
|
||||
}
|
||||
@@ -107,6 +279,9 @@ static __always_inline void add_page_to_
|
||||
static __always_inline void del_page_from_lru_list(struct page *page,
|
||||
struct lruvec *lruvec)
|
||||
{
|
||||
+ if (lru_gen_del_page(lruvec, page, false))
|
||||
+ return;
|
||||
+
|
||||
list_del(&page->lru);
|
||||
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
|
||||
-thp_nr_pages(page));
|
||||
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -294,6 +294,102 @@ enum lruvec_flags {
|
||||
*/
|
||||
};
|
||||
|
||||
+#endif /* !__GENERATING_BOUNDS_H */
|
||||
+
|
||||
+/*
|
||||
+ * Evictable pages are divided into multiple generations. The youngest and the
|
||||
+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
|
||||
+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
|
||||
+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
|
||||
+ * corresponding generation. The gen counter in page->flags stores gen+1 while
|
||||
+ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
|
||||
+ *
|
||||
+ * A page is added to the youngest generation on faulting. The aging needs to
|
||||
+ * check the accessed bit at least twice before handing this page over to the
|
||||
+ * eviction. The first check takes care of the accessed bit set on the initial
|
||||
+ * fault; the second check makes sure this page hasn't been used since then.
|
||||
+ * This process, AKA second chance, requires a minimum of two generations,
|
||||
+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
|
||||
+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
|
||||
+ * rest of generations, if they exist, are considered inactive. See
|
||||
+ * lru_gen_is_active().
|
||||
+ *
|
||||
+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
|
||||
+ * the aging needs not to worry about it. And it's set again when a page
|
||||
+ * considered active is isolated for non-reclaiming purposes, e.g., migration.
|
||||
+ * See lru_gen_add_page() and lru_gen_del_page().
|
||||
+ *
|
||||
+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
|
||||
+ * number of categories of the active/inactive LRU when keeping track of
|
||||
+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
|
||||
+ * in page->flags.
|
||||
+ */
|
||||
+#define MIN_NR_GENS 2U
|
||||
+#define MAX_NR_GENS 4U
|
||||
+
|
||||
+#ifndef __GENERATING_BOUNDS_H
|
||||
+
|
||||
+struct lruvec;
|
||||
+
|
||||
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
|
||||
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
|
||||
+
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+
|
||||
+enum {
|
||||
+ LRU_GEN_ANON,
|
||||
+ LRU_GEN_FILE,
|
||||
+};
|
||||
+
|
||||
+/*
|
||||
+ * The youngest generation number is stored in max_seq for both anon and file
|
||||
+ * types as they are aged on an equal footing. The oldest generation numbers are
|
||||
+ * stored in min_seq[] separately for anon and file types as clean file pages
|
||||
+ * can be evicted regardless of swap constraints.
|
||||
+ *
|
||||
+ * Normally anon and file min_seq are in sync. But if swapping is constrained,
|
||||
+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
|
||||
+ * min_seq behind.
|
||||
+ *
|
||||
+ * The number of pages in each generation is eventually consistent and therefore
|
||||
+ * can be transiently negative.
|
||||
+ */
|
||||
+struct lru_gen_struct {
|
||||
+ /* the aging increments the youngest generation number */
|
||||
+ unsigned long max_seq;
|
||||
+ /* the eviction increments the oldest generation numbers */
|
||||
+ unsigned long min_seq[ANON_AND_FILE];
|
||||
+ /* the multi-gen LRU lists, lazily sorted on eviction */
|
||||
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
+ /* the multi-gen LRU sizes, eventually consistent */
|
||||
+ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
||||
+};
|
||||
+
|
||||
+void lru_gen_init_lruvec(struct lruvec *lruvec);
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
|
||||
+void lru_gen_exit_memcg(struct mem_cgroup *memcg);
|
||||
+#endif
|
||||
+
|
||||
+#else /* !CONFIG_LRU_GEN */
|
||||
+
|
||||
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
struct lruvec {
|
||||
struct list_head lists[NR_LRU_LISTS];
|
||||
/* per lruvec lru_lock for memcg */
|
||||
@@ -311,6 +407,10 @@ struct lruvec {
|
||||
unsigned long refaults[ANON_AND_FILE];
|
||||
/* Various lruvec state flags (enum lruvec_flags) */
|
||||
unsigned long flags;
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ /* evictable pages divided into generations */
|
||||
+ struct lru_gen_struct lrugen;
|
||||
+#endif
|
||||
#ifdef CONFIG_MEMCG
|
||||
struct pglist_data *pgdat;
|
||||
#endif
|
||||
--- a/include/linux/page-flags-layout.h
|
||||
+++ b/include/linux/page-flags-layout.h
|
||||
@@ -55,7 +55,8 @@
|
||||
#define SECTIONS_WIDTH 0
|
||||
#endif
|
||||
|
||||
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
|
||||
+ <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#define NODES_WIDTH NODES_SHIFT
|
||||
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
|
||||
#error "Vmemmap: No space for nodes field in page flags"
|
||||
@@ -89,8 +90,8 @@
|
||||
#define LAST_CPUPID_SHIFT 0
|
||||
#endif
|
||||
|
||||
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
|
||||
- <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
|
||||
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
|
||||
#else
|
||||
#define LAST_CPUPID_WIDTH 0
|
||||
@@ -100,10 +101,12 @@
|
||||
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
|
||||
#endif
|
||||
|
||||
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
|
||||
- > BITS_PER_LONG - NR_PAGEFLAGS
|
||||
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
|
||||
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
|
||||
#error "Not enough bits in page flags"
|
||||
#endif
|
||||
|
||||
+#define LRU_REFS_WIDTH 0
|
||||
+
|
||||
#endif
|
||||
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
|
||||
--- a/include/linux/page-flags.h
|
||||
+++ b/include/linux/page-flags.h
|
||||
@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
|
||||
1UL << PG_private | 1UL << PG_private_2 | \
|
||||
1UL << PG_writeback | 1UL << PG_reserved | \
|
||||
1UL << PG_slab | 1UL << PG_active | \
|
||||
- 1UL << PG_unevictable | __PG_MLOCKED)
|
||||
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
|
||||
|
||||
/*
|
||||
* Flags checked when a page is prepped for return by the page allocator.
|
||||
@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
|
||||
* alloc-free cycle to prevent from reusing the page.
|
||||
*/
|
||||
#define PAGE_FLAGS_CHECK_AT_PREP \
|
||||
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
|
||||
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
|
||||
|
||||
#define PAGE_FLAGS_PRIVATE \
|
||||
(1UL << PG_private | 1UL << PG_private_2)
|
||||
--- a/include/linux/sched.h
|
||||
+++ b/include/linux/sched.h
|
||||
@@ -911,6 +911,10 @@ struct task_struct {
|
||||
#ifdef CONFIG_MEMCG
|
||||
unsigned in_user_fault:1;
|
||||
#endif
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ /* whether the LRU algorithm may apply to this access */
|
||||
+ unsigned in_lru_fault:1;
|
||||
+#endif
|
||||
#ifdef CONFIG_COMPAT_BRK
|
||||
unsigned brk_randomized:1;
|
||||
#endif
|
||||
--- a/kernel/bounds.c
|
||||
+++ b/kernel/bounds.c
|
||||
@@ -22,6 +22,11 @@ int main(void)
|
||||
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
|
||||
#endif
|
||||
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
|
||||
+#else
|
||||
+ DEFINE(LRU_GEN_WIDTH, 0);
|
||||
+#endif
|
||||
/* End of constants */
|
||||
|
||||
return 0;
|
||||
--- a/mm/Kconfig
|
||||
+++ b/mm/Kconfig
|
||||
@@ -897,6 +897,14 @@ config IO_MAPPING
|
||||
config SECRETMEM
|
||||
def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
|
||||
|
||||
+config LRU_GEN
|
||||
+ bool "Multi-Gen LRU"
|
||||
+ depends on MMU
|
||||
+ # make sure page->flags has enough spare bits
|
||||
+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
|
||||
+ help
|
||||
+ A high performance LRU implementation to overcommit memory.
|
||||
+
|
||||
source "mm/damon/Kconfig"
|
||||
|
||||
endmenu
|
||||
--- a/mm/huge_memory.c
|
||||
+++ b/mm/huge_memory.c
|
||||
@@ -2366,7 +2366,8 @@ static void __split_huge_page_tail(struc
|
||||
#ifdef CONFIG_64BIT
|
||||
(1L << PG_arch_2) |
|
||||
#endif
|
||||
- (1L << PG_dirty)));
|
||||
+ (1L << PG_dirty) |
|
||||
+ LRU_GEN_MASK | LRU_REFS_MASK));
|
||||
|
||||
/* ->mapping in first tail page is compound_mapcount */
|
||||
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
|
||||
--- a/mm/memcontrol.c
|
||||
+++ b/mm/memcontrol.c
|
||||
@@ -5178,6 +5178,7 @@ static void __mem_cgroup_free(struct mem
|
||||
|
||||
static void mem_cgroup_free(struct mem_cgroup *memcg)
|
||||
{
|
||||
+ lru_gen_exit_memcg(memcg);
|
||||
memcg_wb_domain_exit(memcg);
|
||||
__mem_cgroup_free(memcg);
|
||||
}
|
||||
@@ -5241,6 +5242,7 @@ static struct mem_cgroup *mem_cgroup_all
|
||||
memcg->deferred_split_queue.split_queue_len = 0;
|
||||
#endif
|
||||
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
|
||||
+ lru_gen_init_memcg(memcg);
|
||||
return memcg;
|
||||
fail:
|
||||
mem_cgroup_id_remove(memcg);
|
||||
--- a/mm/memory.c
|
||||
+++ b/mm/memory.c
|
||||
@@ -4792,6 +4792,27 @@ static inline void mm_account_fault(stru
|
||||
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
|
||||
+{
|
||||
+ /* the LRU algorithm doesn't apply to sequential or random reads */
|
||||
+ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_exit_fault(void)
|
||||
+{
|
||||
+ current->in_lru_fault = false;
|
||||
+}
|
||||
+#else
|
||||
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_exit_fault(void)
|
||||
+{
|
||||
+}
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
/*
|
||||
* By the time we get here, we already hold the mm semaphore
|
||||
*
|
||||
@@ -4823,11 +4844,15 @@ vm_fault_t handle_mm_fault(struct vm_are
|
||||
if (flags & FAULT_FLAG_USER)
|
||||
mem_cgroup_enter_user_fault();
|
||||
|
||||
+ lru_gen_enter_fault(vma);
|
||||
+
|
||||
if (unlikely(is_vm_hugetlb_page(vma)))
|
||||
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
|
||||
else
|
||||
ret = __handle_mm_fault(vma, address, flags);
|
||||
|
||||
+ lru_gen_exit_fault();
|
||||
+
|
||||
if (flags & FAULT_FLAG_USER) {
|
||||
mem_cgroup_exit_user_fault();
|
||||
/*
|
||||
--- a/mm/mm_init.c
|
||||
+++ b/mm/mm_init.c
|
||||
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
|
||||
|
||||
shift = 8 * sizeof(unsigned long);
|
||||
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
|
||||
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
|
||||
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
|
||||
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
|
||||
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
|
||||
SECTIONS_WIDTH,
|
||||
NODES_WIDTH,
|
||||
ZONES_WIDTH,
|
||||
LAST_CPUPID_WIDTH,
|
||||
KASAN_TAG_WIDTH,
|
||||
+ LRU_GEN_WIDTH,
|
||||
+ LRU_REFS_WIDTH,
|
||||
NR_PAGEFLAGS);
|
||||
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
|
||||
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
|
||||
--- a/mm/mmzone.c
|
||||
+++ b/mm/mmzone.c
|
||||
@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
|
||||
|
||||
for_each_lru(lru)
|
||||
INIT_LIST_HEAD(&lruvec->lists[lru]);
|
||||
+
|
||||
+ lru_gen_init_lruvec(lruvec);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
|
||||
--- a/mm/swap.c
|
||||
+++ b/mm/swap.c
|
||||
@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
|
||||
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
|
||||
+ /* see the comment in lru_gen_add_page() */
|
||||
+ if (lru_gen_enabled() && !PageUnevictable(page) &&
|
||||
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
|
||||
+ SetPageActive(page);
|
||||
+
|
||||
get_page(page);
|
||||
local_lock(&lru_pvecs.lock);
|
||||
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
|
||||
@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
|
||||
|
||||
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
- if (PageActive(page) && !PageUnevictable(page)) {
|
||||
+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
@@ -661,7 +666,8 @@ void deactivate_file_page(struct page *p
|
||||
*/
|
||||
void deactivate_page(struct page *page)
|
||||
{
|
||||
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
|
||||
+ if (PageLRU(page) && !PageUnevictable(page) &&
|
||||
+ (PageActive(page) || lru_gen_enabled())) {
|
||||
struct pagevec *pvec;
|
||||
|
||||
local_lock(&lru_pvecs.lock);
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -2821,6 +2821,81 @@ static bool can_age_anon_pages(struct pg
|
||||
return can_demote(pgdat->node_id, sc);
|
||||
}
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * shorthand helpers
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+#define for_each_gen_type_zone(gen, type, zone) \
|
||||
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
|
||||
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
|
||||
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
|
||||
+
|
||||
+static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
|
||||
+{
|
||||
+ struct pglist_data *pgdat = NODE_DATA(nid);
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+ if (memcg) {
|
||||
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
|
||||
+
|
||||
+ /* for hotadd_new_pgdat() */
|
||||
+ if (!lruvec->pgdat)
|
||||
+ lruvec->pgdat = pgdat;
|
||||
+
|
||||
+ return lruvec;
|
||||
+ }
|
||||
+#endif
|
||||
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
|
||||
+
|
||||
+ return pgdat ? &pgdat->__lruvec : NULL;
|
||||
+}
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * initialization
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+void lru_gen_init_lruvec(struct lruvec *lruvec)
|
||||
+{
|
||||
+ int gen, type, zone;
|
||||
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+
|
||||
+ lrugen->max_seq = MIN_NR_GENS + 1;
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone)
|
||||
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
||||
+}
|
||||
+
|
||||
+#ifdef CONFIG_MEMCG
|
||||
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+}
|
||||
+
|
||||
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
|
||||
+{
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
|
||||
+ sizeof(lruvec->lrugen.nr_pages)));
|
||||
+ }
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
+static int __init init_lru_gen(void)
|
||||
+{
|
||||
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
|
||||
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
||||
+
|
||||
+ return 0;
|
||||
+};
|
||||
+late_initcall(init_lru_gen);
|
||||
+
|
||||
+#endif /* CONFIG_LRU_GEN */
|
||||
+
|
||||
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
unsigned long nr[NR_LRU_LISTS];
|
File diff suppressed because it is too large
@ -0,0 +1,491 @@
From e4277535f6d6708bb19b88c4bad155832671d69b Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:04 -0600
Subject: [PATCH 07/29] mm: multi-gen LRU: exploit locality in rmap
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Searching the rmap for PTEs mapping each page on an LRU list (to test and
clear the accessed bit) can be expensive because pages from different VMAs
(PA space) are not cache friendly to the rmap (VA space). For workloads
mostly using mapped pages, searching the rmap can incur the highest CPU
cost in the reclaim path.

This patch exploits spatial locality to reduce the trips into the rmap.
When shrink_page_list() walks the rmap and finds a young PTE, a new
function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent
PTEs. On finding another young PTE, it clears the accessed bit and
updates the gen counter of the page mapped by this PTE to
(max_seq%MAX_NR_GENS)+1.

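The idea can be sketched in a few lines of C (a simplified model over a
flat array of PTE-like entries; toy_pte and look_around() are illustrative
stand-ins, not the kernel's page_vma_mapped_walk machinery):

/*
 * An illustrative sketch of the look-around idea: around one entry
 * already known to be young, scan its neighbors in the same batch and
 * promote any that are also young.
 */
#include <stdbool.h>

#define BATCH 64	/* stands in for BITS_PER_LONG */

struct toy_pte {
	bool young;		/* the simulated accessed bit */
	unsigned long gen;	/* gen counter of the mapped page */
};

static void look_around(struct toy_pte *batch, int hit,
			unsigned long max_seq, unsigned long nr_gens)
{
	for (int i = 0; i < BATCH; i++) {
		if (i == hit || !batch[i].young)
			continue;
		batch[i].young = false;	/* clear the accessed bit */
		/* promote to the youngest generation */
		batch[i].gen = (max_seq % nr_gens) + 1;
	}
}

One rmap hit thus pays for up to BATCH-1 neighbors, which is where the
reduction in rmap trips comes from.
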
Server benchmark results:
Single workload:
fio (buffered I/O): no change

Single workload:
memcached (anon): +[3, 5]%
Ops/sec KB/sec
patch1-6: 1106168.46 43025.04
patch1-7: 1147696.57 44640.29

Configurations:
no change

Client benchmark results:
kswapd profiles:
patch1-6
39.03% lzo1x_1_do_compress (real work)
18.47% page_vma_mapped_walk (overhead)
6.74% _raw_spin_unlock_irq
3.97% do_raw_spin_lock
2.49% ptep_clear_flush
2.48% anon_vma_interval_tree_iter_first
1.92% page_referenced_one
1.88% __zram_bvec_write
1.48% memmove
1.31% vma_interval_tree_iter_next

patch1-7
48.16% lzo1x_1_do_compress (real work)
8.20% page_vma_mapped_walk (overhead)
7.06% _raw_spin_unlock_irq
2.92% ptep_clear_flush
2.53% __zram_bvec_write
2.11% do_raw_spin_lock
2.02% memmove
1.93% lru_gen_look_around
1.56% free_unref_page_list
1.40% memset

Configurations:
no change

Link: https://lkml.kernel.org/r/20220918080010.2920238-8-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Barry Song <baohua@kernel.org>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/memcontrol.h | 31 +++++++
include/linux/mmzone.h | 6 ++
mm/internal.h | 1 +
mm/memcontrol.c | 1 +
mm/rmap.c | 7 ++
mm/swap.c | 4 +-
mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++
7 files changed, 232 insertions(+), 2 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -442,6 +442,7 @@ static inline struct obj_cgroup *__page_
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem page a caller should hold an rcu read lock to protect memcg
* associated with a kmem page from being released.
@@ -497,6 +498,7 @@ static inline struct mem_cgroup *page_me
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*
* For a kmem page a caller should hold an rcu read lock to protect memcg
* associated with a kmem page from being released.
@@ -953,6 +955,23 @@ void unlock_page_memcg(struct page *page

void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);

+/* try to stablize page_memcg() for all the pages in a memcg */
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ rcu_read_lock();
+
+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
+ return true;
+
+ rcu_read_unlock();
+ return false;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
/* idx can be of type enum memcg_stat_item or node_stat_item */
static inline void mod_memcg_state(struct mem_cgroup *memcg,
int idx, int val)
@@ -1369,6 +1388,18 @@ static inline void unlock_page_memcg(str
{
}

+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
+{
+ /* to match page_memcg_rcu() */
+ rcu_read_lock();
+ return true;
+}
+
+static inline void mem_cgroup_unlock_pages(void)
+{
+ rcu_read_unlock();
+}
+
static inline void mem_cgroup_handle_over_high(void)
{
}
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -352,6 +352,7 @@ enum lruvec_flags {
#ifndef __GENERATING_BOUNDS_H

struct lruvec;
+struct page_vma_mapped_walk;

#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
@@ -407,6 +408,7 @@ struct lru_gen_struct {
};

void lru_gen_init_lruvec(struct lruvec *lruvec);
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
@@ -419,6 +421,10 @@ static inline void lru_gen_init_lruvec(s
{
}

+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+}
+
#ifdef CONFIG_MEMCG
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -35,6 +35,7 @@
void page_writeback_init(void);

vm_fault_t do_swap_page(struct vm_fault *vmf);
+void activate_page(struct page *page);

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2798,6 +2798,7 @@ static void commit_charge(struct page *p
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
+ * - mem_cgroup_trylock_pages()
*/
page->memcg_data = (unsigned long)memcg;
}
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -73,6 +73,7 @@
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>

#include <asm/tlbflush.h>

@@ -793,6 +794,12 @@ static bool page_referenced_one(struct p
}

if (pvmw.pte) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ lru_gen_look_around(&pvmw);
+ referenced++;
+ }
+
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
/*
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -325,7 +325,7 @@ static bool need_activate_page_drain(int
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}

-static void activate_page(struct page *page)
+void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
@@ -345,7 +345,7 @@ static inline void activate_page_drain(i
{
}

-static void activate_page(struct page *page)
+void activate_page(struct page *page)
{
struct lruvec *lruvec;

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1409,6 +1409,11 @@ retry:
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;

+ /* page_update_gen() tried to promote this page? */
+ if (lru_gen_enabled() && !ignore_references &&
+ page_mapped(page) && PageReferenced(page))
+ goto keep_locked;
+
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

@@ -2990,6 +2995,29 @@ static bool positive_ctrl_err(struct ctr
* the aging
******************************************************************************/

+/* promote pages accessed through page tables */
+static int page_update_gen(struct page *page, int gen)
+{
+ unsigned long new_flags, old_flags = READ_ONCE(page->flags);
+
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
+
+ do {
+ /* lru_gen_del_page() has isolated this page? */
+ if (!(old_flags & LRU_GEN_MASK)) {
+ /* for shrink_page_list() */
+ new_flags = old_flags | BIT(PG_referenced);
+ continue;
+ }
+
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
+
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
/* protect pages accessed multiple times through file descriptors */
static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
@@ -3001,6 +3029,11 @@ static int page_inc_gen(struct lruvec *l
VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);

do {
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+ /* page_update_gen() has promoted this page? */
+ if (new_gen >= 0 && new_gen != old_gen)
+ return new_gen;
+
new_gen = (old_gen + 1) % MAX_NR_GENS;

new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
@@ -3015,6 +3048,43 @@ static int page_inc_gen(struct lruvec *l
return new_gen;
}

+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
+{
+ unsigned long pfn = pte_pfn(pte);
+
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
+
+ if (!pte_present(pte) || is_zero_pfn(pfn))
+ return -1;
+
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
+ return -1;
+
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
+ return -1;
+
+ return pfn;
+}
+
+static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
+ struct pglist_data *pgdat)
+{
+ struct page *page;
+
+ /* try to avoid unnecessary memory loads */
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
+ return NULL;
+
+ page = compound_head(pfn_to_page(pfn));
+ if (page_to_nid(page) != pgdat->node_id)
+ return NULL;
+
+ if (page_memcg_rcu(page) != memcg)
+ return NULL;
+
+ return page;
+}
+
static void inc_min_seq(struct lruvec *lruvec, int type)
{
struct lru_gen_struct *lrugen = &lruvec->lrugen;
@@ -3214,6 +3284,114 @@ static void lru_gen_age_node(struct pgli
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
}

+/*
+ * This function exploits spatial locality when shrink_page_list() walks the
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
+ */
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
+{
+ int i;
+ pte_t *pte;
+ unsigned long start;
+ unsigned long end;
+ unsigned long addr;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
+ struct page *page = pvmw->page;
+ struct mem_cgroup *memcg = page_memcg(page);
+ struct pglist_data *pgdat = page_pgdat(page);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ DEFINE_MAX_SEQ(lruvec);
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
+
+ lockdep_assert_held(pvmw->ptl);
+ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
+
+ if (spin_is_contended(pvmw->ptl))
+ return;
+
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
+
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
+ else {
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
+ }
+ }
+
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
+
+ rcu_read_lock();
+ arch_enter_lazy_mmu_mode();
+
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
+ unsigned long pfn;
+
+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
+ if (pfn == -1)
+ continue;
+
+ if (!pte_young(pte[i]))
+ continue;
+
+ page = get_pfn_page(pfn, memcg, pgdat);
+ if (!page)
+ continue;
+
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
+ VM_WARN_ON_ONCE(true);
+
+ if (pte_dirty(pte[i]) && !PageDirty(page) &&
+ !(PageAnon(page) && PageSwapBacked(page) &&
+ !PageSwapCache(page)))
+ set_page_dirty(page);
+
+ old_gen = page_lru_gen(page);
+ if (old_gen < 0)
+ SetPageReferenced(page);
+ else if (old_gen != new_gen)
+ __set_bit(i, bitmap);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ rcu_read_unlock();
+
+ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ page = pte_page(pte[i]);
+ activate_page(page);
+ }
+ return;
+ }
+
+ /* page_update_gen() requires stable page_memcg() */
+ if (!mem_cgroup_trylock_pages(memcg))
+ return;
+
+ spin_lock_irq(&lruvec->lru_lock);
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
+
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
+ page = compound_head(pte_page(pte[i]));
+ if (page_memcg_rcu(page) != memcg)
+ continue;
+
+ old_gen = page_update_gen(page, new_gen);
+ if (old_gen < 0 || old_gen == new_gen)
+ continue;
+
+ lru_gen_update_size(lruvec, page, old_gen, new_gen);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+
+ mem_cgroup_unlock_pages();
+}
+
/******************************************************************************
* the eviction
******************************************************************************/
@@ -3250,6 +3428,12 @@ static bool sort_page(struct lruvec *lru
return true;
}

+ /* promoted */
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
+ list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+ return true;
+ }
+
/* protected */
if (tier > tier_idx) {
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
File diff suppressed because it is too large
@ -0,0 +1,315 @@
From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:06 -0600
Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When multiple memcgs are available, it is possible to use generations as a
frame of reference to make better choices and improve overall performance
under global memory pressure. This patch adds a basic optimization to
select memcgs that can drop single-use unmapped clean pages first. Doing
so reduces the chance of going into the aging path or swapping, which can
be costly.

A typical example that benefits from this optimization is a server running
mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
buffered I/O workload in the other.

Though this optimization can be applied to both kswapd and direct reclaim,
it is only added to kswapd to keep the patchset manageable. Later
improvements may cover the direct reclaim path.

While ensuring certain fairness to all eligible memcgs, proportional scans
of individual memcgs also require proper backoff to avoid overshooting
their aggregate reclaim target by too much. Otherwise it can cause high
direct reclaim latency. The conditions for backoff are:

1. At low priorities, for direct reclaim, if aging fairness or direct
reclaim latency is at risk, i.e., aging one memcg multiple times or
swapping after the target is met.
2. At high priorities, for global reclaim, if per-zone free pages are
above respective watermarks.

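Condition 2 reduces to a per-zone watermark check, roughly as in the
following sketch (toy_zone and zones_above_watermarks() are stand-ins
for the kernel's zone bookkeeping, not real symbols):

/*
 * A simplified sketch of backoff condition 2 above: global reclaim may
 * stop early once every eligible zone is above its watermark.
 */
#include <stdbool.h>

struct toy_zone {
	unsigned long free_pages;
	unsigned long watermark;	/* high for kswapd, low otherwise */
};

static bool zones_above_watermarks(const struct toy_zone *zones, int nr)
{
	for (int i = 0; i < nr; i++) {
		/* one zone below its watermark means no backoff yet */
		if (zones[i].free_pages < zones[i].watermark)
			return false;
	}
	return true;
}
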
Server benchmark results:
Mixed workloads:
fio (buffered I/O): +[19, 21]%
IOPS BW
patch1-8: 1880k 7343MiB/s
patch1-9: 2252k 8796MiB/s

memcached (anon): +[119, 123]%
Ops/sec KB/sec
patch1-8: 862768.65 33514.68
patch1-9: 1911022.12 74234.54

Mixed workloads:
fio (buffered I/O): +[75, 77]%
IOPS BW
5.19-rc1: 1279k 4996MiB/s
patch1-9: 2252k 8796MiB/s

memcached (anon): +[13, 15]%
Ops/sec KB/sec
5.19-rc1: 1673524.04 65008.87
patch1-9: 1911022.12 74234.54

Configurations:
(changes since patch 6)

cat mixed.sh
modprobe brd rd_nr=2 rd_size=56623104

swapoff -a
mkswap /dev/ram0
swapon /dev/ram0

mkfs.ext4 /dev/ram1
mount -t ext4 /dev/ram1 /mnt

memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
--ratio 1:0 --pipeline 8 -d 2000

fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
--buffered=1 --ioengine=io_uring --iodepth=128 \
--iodepth_batch_submit=32 --iodepth_batch_complete=32 \
--rw=randread --random_distribution=random --norandommap \
--time_based --ramp_time=10m --runtime=90m --group_reporting &
pid=$!

sleep 200

memtier_benchmark -S /var/run/memcached/memcached.sock \
-P memcache_binary -n allkeys --key-minimum=1 \
--key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
--ratio 0:1 --pipeline 8 --randomize --distinct-client-seed

kill -INT $pid
wait

Client benchmark results:
no change (CONFIG_MEMCG=n)

Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 96 insertions(+), 9 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -127,6 +127,12 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;

+#ifdef CONFIG_LRU_GEN
+ /* help kswapd make better choices among multiple memcgs */
+ unsigned int memcgs_need_aging:1;
+ unsigned long last_reclaimed;
+#endif
+
/* Allocation order */
s8 order;

@@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pgli

VM_WARN_ON_ONCE(!current_is_kswapd());

+ sc->last_reclaimed = sc->nr_reclaimed;
+
+ /*
+ * To reduce the chance of going into the aging path, which can be
+ * costly, optimistically skip it if the flag below was cleared in the
+ * eviction path. This improves the overall performance when multiple
+ * memcgs are available.
+ */
+ if (!sc->memcgs_need_aging) {
+ sc->memcgs_need_aging = true;
+ return;
+ }
+
set_mm_walk(pgdat);

memcg = mem_cgroup_iter(NULL, NULL, NULL);
@@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *
return scanned;
}

-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ bool *need_swapping)
{
int type;
int scanned;
@@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lr

sc->nr_reclaimed += reclaimed;

+ if (need_swapping && type == LRU_GEN_ANON)
+ *need_swapping = true;
+
return scanned;
}

@@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lr
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+ bool can_swap, bool *need_aging)
{
- bool need_aging;
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(stru
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;

- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!need_aging)
+ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
+ if (!*need_aging)
return nr_to_scan;

/* skip the aging path at the default priority */
@@ -4715,10 +4737,68 @@ done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
}

+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
+ struct scan_control *sc, bool need_swapping)
+{
+ int i;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (!current_is_kswapd()) {
+ /* age each memcg once to ensure fairness */
+ if (max_seq - seq > 1)
+ return true;
+
+ /* over-swapping can increase allocation latency */
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
+ return true;
+
+ /* give this thread a chance to exit and free its memory */
+ if (fatal_signal_pending(current)) {
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+ return true;
+ }
+
+ if (cgroup_reclaim(sc))
+ return false;
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
+ return false;
+
+ /* keep scanning at low priorities to ensure fairness */
+ if (sc->priority > DEF_PRIORITY - 2)
+ return false;
+
+ /*
+ * A minimum amount of work was done under global memory pressure. For
+ * kswapd, it may be overshooting. For direct reclaim, the target isn't
+ * met, and yet the allocation may still succeed, since kswapd may have
+ * caught up. In either case, it's better to stop now, and restart if
+ * necessary.
+ */
+ for (i = 0; i <= sc->reclaim_idx; i++) {
+ unsigned long wmark;
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
+
+ if (!managed_zone(zone))
+ continue;
+
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
+ return false;
+ }
+
+ sc->nr_reclaimed += MIN_LRU_BATCH;
+
+ return true;
+}
+
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
+ bool need_aging = false;
+ bool need_swapping = false;
unsigned long scanned = 0;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ DEFINE_MAX_SEQ(lruvec);

lru_add_drain();

@@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct
else
swappiness = 0;

- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
if (!nr_to_scan)
- break;
+ goto done;

- delta = evict_pages(lruvec, sc, swappiness);
+ delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
if (!delta)
- break;
+ goto done;

scanned += delta;
if (scanned >= nr_to_scan)
break;

+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+ break;
+
cond_resched();
}

+ /* see the comment in lru_gen_age_node() */
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
+ sc->memcgs_need_aging = false;
+done:
clear_mm_walk();

blk_finish_plug(&plug);
@ -0,0 +1,498 @@
From 640db3a029dca909af47157ca18f52b29d34a1b9 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:07 -0600
Subject: [PATCH 10/29] mm: multi-gen LRU: kill switch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
can be disabled include:
0x0001: the multi-gen LRU core
0x0002: walking page table, when arch_has_hw_pte_young() returns
true
0x0004: clearing the accessed bit in non-leaf PMD entries, when
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
[yYnN]: apply to all the components above
E.g.,
echo y >/sys/kernel/mm/lru_gen/enabled
cat /sys/kernel/mm/lru_gen/enabled
0x0007
echo 5 >/sys/kernel/mm/lru_gen/enabled
cat /sys/kernel/mm/lru_gen/enabled
0x0005

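The value is a plain bitmask, so the components combine as in this sketch
(the TOY_CAP_* names mirror the list above but are illustrative; the
kernel uses static keys rather than a plain integer):

/*
 * A sketch of how the capability bits combine: 0x0007 means all three
 * components are enabled; writing 5 clears bit 0x0002, leaving 0x0005.
 */
#include <stdbool.h>
#include <stdio.h>

enum {
	TOY_CAP_CORE = 0x0001,		/* the multi-gen LRU core */
	TOY_CAP_WALK = 0x0002,		/* page table walks */
	TOY_CAP_NONLEAF = 0x0004,	/* non-leaf PMD accessed bit */
};

static bool cap_enabled(unsigned int caps, unsigned int cap)
{
	return caps & cap;
}

int main(void)
{
	unsigned int caps = 0x0007;	/* echo y > .../enabled */

	caps = 0x0005;			/* echo 5 > .../enabled */
	printf("walk enabled: %d\n", cap_enabled(caps, TOY_CAP_WALK));
	return 0;
}
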
NB: the page table walks happen on the scale of seconds under heavy memory
pressure, in which case the mmap_lock contention is a lesser concern,
compared with the LRU lock contention and the I/O congestion. So far the
only well-known case of the mmap_lock contention happens on Android, due
to Scudo [1] which allocates several thousand VMAs for merely a few
hundred MBs. The SPF and the Maple Tree also have provided their own
assessments [2][3]. However, if walking page tables does worsen the
mmap_lock contention, the kill switch can be used to disable it. In this
case the multi-gen LRU will suffer a minor performance degradation, as
shown previously.

Clearing the accessed bit in non-leaf PMD entries can also be disabled,
since this behavior was not tested on x86 varieties other than Intel and
AMD.

[1] https://source.android.com/devices/tech/debug/scudo
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
[3] https://lore.kernel.org/r/20220426150616.3937571-1-Liam.Howlett@oracle.com/

Link: https://lkml.kernel.org/r/20220918080010.2920238-11-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/cgroup.h | 15 ++-
include/linux/mm_inline.h | 15 ++-
include/linux/mmzone.h | 9 ++
kernel/cgroup/cgroup-internal.h | 1 -
mm/Kconfig | 6 +
mm/vmscan.c | 228 +++++++++++++++++++++++++++++++-
6 files changed, 265 insertions(+), 9 deletions(-)

--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -433,6 +433,18 @@ static inline void cgroup_put(struct cgr
css_put(&cgrp->self);
}

+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -447,7 +459,6 @@ static inline void cgroup_put(struct cgr
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -708,6 +719,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -91,10 +91,21 @@ static __always_inline enum lru_list pag

#ifdef CONFIG_LRU_GEN

+#ifdef CONFIG_LRU_GEN_ENABLED
static inline bool lru_gen_enabled(void)
{
- return true;
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+}
+#else
+static inline bool lru_gen_enabled(void)
+{
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
}
+#endif

static inline bool lru_gen_in_fault(void)
{
@@ -207,7 +218,7 @@ static inline bool lru_gen_add_page(stru

VM_WARN_ON_ONCE_PAGE(gen != -1, page);

- if (PageUnevictable(page))
+ if (PageUnevictable(page) || !lrugen->enabled)
return false;
/*
* There are three common cases for this page:
--- a/include/linux/mmzone.h
|
||||
+++ b/include/linux/mmzone.h
|
||||
@@ -364,6 +364,13 @@ enum {
|
||||
LRU_GEN_FILE,
|
||||
};
|
||||
|
||||
+enum {
|
||||
+ LRU_GEN_CORE,
|
||||
+ LRU_GEN_MM_WALK,
|
||||
+ LRU_GEN_NONLEAF_YOUNG,
|
||||
+ NR_LRU_GEN_CAPS
|
||||
+};
|
||||
+
|
||||
#define MIN_LRU_BATCH BITS_PER_LONG
|
||||
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 64)
|
||||
|
||||
@@ -405,6 +412,8 @@ struct lru_gen_struct {
|
||||
/* can be modified without holding the LRU lock */
|
||||
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
||||
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
||||
+ /* whether the multi-gen LRU is enabled */
|
||||
+ bool enabled;
|
||||
};
|
||||
|
||||
enum {
|
||||
--- a/kernel/cgroup/cgroup-internal.h
|
||||
+++ b/kernel/cgroup/cgroup-internal.h
|
||||
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
|
||||
#define DEFINE_CGROUP_MGCTX(name) \
|
||||
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
|
||||
|
||||
-extern struct mutex cgroup_mutex;
|
||||
extern spinlock_t css_set_lock;
|
||||
extern struct cgroup_subsys *cgroup_subsys[];
|
||||
extern struct list_head cgroup_roots;
|
||||
--- a/mm/Kconfig
|
||||
+++ b/mm/Kconfig
|
||||
@@ -906,6 +906,12 @@ config LRU_GEN
|
||||
help
|
||||
A high performance LRU implementation to overcommit memory.
|
||||
|
||||
+config LRU_GEN_ENABLED
|
||||
+ bool "Enable by default"
|
||||
+ depends on LRU_GEN
|
||||
+ help
|
||||
+ This option enables the multi-gen LRU by default.
|
||||
+
|
||||
config LRU_GEN_STATS
|
||||
bool "Full stats for debugging"
|
||||
depends on LRU_GEN
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -52,6 +52,7 @@
|
||||
#include <linux/psi.h>
|
||||
#include <linux/pagewalk.h>
|
||||
#include <linux/shmem_fs.h>
|
||||
+#include <linux/ctype.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/div64.h>
|
||||
@@ -2841,6 +2842,14 @@ static bool can_age_anon_pages(struct pg
|
||||
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
|
||||
+#ifdef CONFIG_LRU_GEN_ENABLED
|
||||
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
||||
+#define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
|
||||
+#else
|
||||
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
|
||||
+#define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
|
||||
+#endif
|
||||
+
|
||||
/******************************************************************************
|
||||
* shorthand helpers
|
||||
******************************************************************************/
|
||||
@@ -3717,7 +3726,8 @@ static void walk_pmd_range_locked(pud_t
|
||||
goto next;
|
||||
|
||||
if (!pmd_trans_huge(pmd[i])) {
|
||||
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
|
||||
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
|
||||
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
|
||||
pmdp_test_and_clear_young(vma, addr, pmd + i);
|
||||
goto next;
|
||||
}
|
||||
@@ -3815,10 +3825,12 @@ restart:
|
||||
walk->mm_stats[MM_NONLEAF_TOTAL]++;
|
||||
|
||||
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
|
||||
- if (!pmd_young(val))
|
||||
- continue;
|
||||
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
|
||||
+ if (!pmd_young(val))
|
||||
+ continue;
|
||||
|
||||
- walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
||||
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
||||
+ }
|
||||
#endif
|
||||
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
|
||||
continue;
|
||||
@@ -4080,7 +4092,7 @@ static bool try_to_inc_max_seq(struct lr
|
||||
* handful of PTEs. Spreading the work out over a period of time usually
|
||||
* is less efficient, but it avoids bursty page faults.
|
||||
*/
|
||||
- if (!arch_has_hw_pte_young()) {
|
||||
+ if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
|
||||
success = iterate_mm_list_nowalk(lruvec, max_seq);
|
||||
goto done;
|
||||
}
|
||||
@@ -4846,6 +4858,208 @@ done:
|
||||
}
|
||||
|
||||
/******************************************************************************
|
||||
+ * state change
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
|
||||
+{
|
||||
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
+
|
||||
+ if (lrugen->enabled) {
|
||||
+ enum lru_list lru;
|
||||
+
|
||||
+ for_each_evictable_lru(lru) {
|
||||
+ if (!list_empty(&lruvec->lists[lru]))
|
||||
+ return false;
|
||||
+ }
|
||||
+ } else {
|
||||
+ int gen, type, zone;
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone) {
|
||||
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool fill_evictable(struct lruvec *lruvec)
|
||||
+{
|
||||
+ enum lru_list lru;
|
||||
+ int remaining = MAX_LRU_BATCH;
|
||||
+
|
||||
+ for_each_evictable_lru(lru) {
|
||||
+ int type = is_file_lru(lru);
|
||||
+ bool active = is_active_lru(lru);
|
||||
+ struct list_head *head = &lruvec->lists[lru];
|
||||
+
|
||||
+ while (!list_empty(head)) {
|
||||
+ bool success;
|
||||
+ struct page *page = lru_to_page(head);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageActive(page) != active, page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_lru_gen(page) != -1, page);
|
||||
+
|
||||
+ del_page_from_lru_list(page, lruvec);
|
||||
+ success = lru_gen_add_page(lruvec, page, false);
|
||||
+ VM_WARN_ON_ONCE(!success);
|
||||
+
|
||||
+ if (!--remaining)
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static bool drain_evictable(struct lruvec *lruvec)
|
||||
+{
|
||||
+ int gen, type, zone;
|
||||
+ int remaining = MAX_LRU_BATCH;
|
||||
+
|
||||
+ for_each_gen_type_zone(gen, type, zone) {
|
||||
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
|
||||
+
|
||||
+ while (!list_empty(head)) {
|
||||
+ bool success;
|
||||
+ struct page *page = lru_to_page(head);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
|
||||
+ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);
|
||||
+
|
||||
+ success = lru_gen_del_page(lruvec, page, false);
|
||||
+ VM_WARN_ON_ONCE(!success);
|
||||
+ add_page_to_lru_list(page, lruvec);
|
||||
+
|
||||
+ if (!--remaining)
|
||||
+ return false;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return true;
|
||||
+}
|
||||
+
|
||||
+static void lru_gen_change_state(bool enabled)
|
||||
+{
|
||||
+ static DEFINE_MUTEX(state_mutex);
|
||||
+
|
||||
+ struct mem_cgroup *memcg;
|
||||
+
|
||||
+ cgroup_lock();
|
||||
+ cpus_read_lock();
|
||||
+ get_online_mems();
|
||||
+ mutex_lock(&state_mutex);
|
||||
+
|
||||
+ if (enabled == lru_gen_enabled())
|
||||
+ goto unlock;
|
||||
+
|
||||
+ if (enabled)
|
||||
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
|
||||
+ else
|
||||
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
|
||||
+
|
||||
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
||||
+ do {
|
||||
+ int nid;
|
||||
+
|
||||
+ for_each_node(nid) {
|
||||
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
||||
+
|
||||
+ if (!lruvec)
|
||||
+ continue;
|
||||
+
|
||||
+ spin_lock_irq(&lruvec->lru_lock);
|
||||
+
|
||||
+ VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
|
||||
+ VM_WARN_ON_ONCE(!state_is_valid(lruvec));
|
||||
+
|
||||
+ lruvec->lrugen.enabled = enabled;
|
||||
+
|
||||
+ while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
|
||||
+ spin_unlock_irq(&lruvec->lru_lock);
|
||||
+ cond_resched();
|
||||
+ spin_lock_irq(&lruvec->lru_lock);
|
||||
+ }
|
||||
+
|
||||
+ spin_unlock_irq(&lruvec->lru_lock);
|
||||
+ }
|
||||
+
|
||||
+ cond_resched();
|
||||
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
||||
+unlock:
|
||||
+ mutex_unlock(&state_mutex);
|
||||
+ put_online_mems();
|
||||
+ cpus_read_unlock();
|
||||
+ cgroup_unlock();
|
||||
+}
|
||||
+
|
||||
+/******************************************************************************
|
||||
+ * sysfs interface
|
||||
+ ******************************************************************************/
|
||||
+
|
||||
+static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
|
||||
+{
|
||||
+ unsigned int caps = 0;
|
||||
+
|
||||
+ if (get_cap(LRU_GEN_CORE))
|
||||
+ caps |= BIT(LRU_GEN_CORE);
|
||||
+
|
||||
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
|
||||
+ caps |= BIT(LRU_GEN_MM_WALK);
|
||||
+
|
||||
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
|
||||
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
|
||||
+
|
||||
+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
|
||||
+}
|
||||
+
|
||||
+static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
|
||||
+ const char *buf, size_t len)
|
||||
+{
|
||||
+ int i;
|
||||
+ unsigned int caps;
|
||||
+
|
||||
+ if (tolower(*buf) == 'n')
|
||||
+ caps = 0;
|
||||
+ else if (tolower(*buf) == 'y')
|
||||
+ caps = -1;
|
||||
+ else if (kstrtouint(buf, 0, &caps))
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
|
||||
+ bool enabled = caps & BIT(i);
|
||||
+
|
||||
+ if (i == LRU_GEN_CORE)
|
||||
+ lru_gen_change_state(enabled);
|
||||
+ else if (enabled)
|
||||
+ static_branch_enable(&lru_gen_caps[i]);
|
||||
+ else
|
||||
+ static_branch_disable(&lru_gen_caps[i]);
|
||||
+ }
|
||||
+
|
||||
+ return len;
|
||||
+}
|
||||
+
|
||||
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
|
||||
+ enabled, 0644, show_enabled, store_enabled
|
||||
+);
|
||||
+
|
||||
+static struct attribute *lru_gen_attrs[] = {
|
||||
+ &lru_gen_enabled_attr.attr,
|
||||
+ NULL
|
||||
+};
|
||||
+
|
||||
+static struct attribute_group lru_gen_attr_group = {
|
||||
+ .name = "lru_gen",
|
||||
+ .attrs = lru_gen_attrs,
|
||||
+};
|
||||
+
|
||||
+/******************************************************************************
|
||||
* initialization
|
||||
******************************************************************************/
|
||||
|
||||
@@ -4855,6 +5069,7 @@ void lru_gen_init_lruvec(struct lruvec *
|
||||
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
||||
|
||||
lrugen->max_seq = MIN_NR_GENS + 1;
|
||||
+ lrugen->enabled = lru_gen_enabled();
|
||||
|
||||
for_each_gen_type_zone(gen, type, zone)
|
||||
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
||||
@@ -4894,6 +5109,9 @@ static int __init init_lru_gen(void)
|
||||
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
|
||||
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
||||
|
||||
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
|
||||
+ pr_err("lru_gen: failed to create sysfs group\n");
|
||||
+
|
||||
return 0;
|
||||
};
|
||||
late_initcall(init_lru_gen);
|
@@ -0,0 +1,226 @@
From 73d1ff551760f0c79c47ab70faa4c2ca91413f5c Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:08 -0600
Subject: [PATCH 11/29] mm: multi-gen LRU: thrashing prevention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add /sys/kernel/mm/lru_gen/min_ttl_ms for thrashing prevention, as
requested by many desktop users [1].

When set to value N, it prevents the working set of N milliseconds from
getting evicted. The OOM killer is triggered if this working set cannot
be kept in memory. Based on the average human detectable lag (~100ms),
N=1000 usually eliminates intolerable lags due to thrashing. Larger
values like N=3000 make lags less noticeable at the risk of premature OOM
kills.
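
As a usage sketch (the value is illustrative, not a tuned
recommendation), protecting the working set of the last second:

echo 1000 >/sys/kernel/mm/lru_gen/min_ttl_ms
cat /sys/kernel/mm/lru_gen/min_ttl_ms
1000

The knob takes milliseconds and is stored internally in jiffies; 0,
the default, disables the protection.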

Compared with the size-based approach [2], this time-based approach
has the following advantages:

1. It is easier to configure because it is agnostic to applications
   and memory sizes.
2. It is more reliable because it is directly wired to the OOM killer.

[1] https://lore.kernel.org/r/Ydza%2FzXKY9ATRoh6@google.com/
[2] https://lore.kernel.org/r/20101028191523.GA14972@google.com/

Link: https://lkml.kernel.org/r/20220918080010.2920238-12-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mmzone.h | 2 ++
mm/vmscan.c | 74 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 73 insertions(+), 3 deletions(-)

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -399,6 +399,8 @@ struct lru_gen_struct {
unsigned long max_seq;
/* the eviction increments the oldest generation numbers */
unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
/* the multi-gen LRU lists, lazily sorted on eviction */
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the multi-gen LRU sizes, eventually consistent */
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4064,6 +4064,7 @@ static void inc_max_seq(struct lruvec *l
for (type = 0; type < ANON_AND_FILE; type++)
reset_ctrl_pos(lruvec, type, false);

+ WRITE_ONCE(lrugen->timestamps[next], jiffies);
/* make sure preceding modifications appear */
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);

@@ -4193,7 +4194,7 @@ static bool should_run_aging(struct lruv
return false;
}

-static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
{
bool need_aging;
unsigned long nr_to_scan;
@@ -4207,16 +4208,36 @@ static void age_lruvec(struct lruvec *lr
mem_cgroup_calculate_protection(NULL, memcg);

if (mem_cgroup_below_min(memcg))
- return;
+ return false;

need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
+
+ if (min_ttl) {
+ int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ if (time_is_after_jiffies(birth + min_ttl))
+ return false;
+
+ /* the size is likely too small to be helpful */
+ if (!nr_to_scan && sc->priority != DEF_PRIORITY)
+ return false;
+ }
+
if (need_aging)
try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+
+ return true;
}

+/* to protect the working set of the last N jiffies */
+static unsigned long lru_gen_min_ttl __read_mostly;
+
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
struct mem_cgroup *memcg;
+ bool success = false;
+ unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);

VM_WARN_ON_ONCE(!current_is_kswapd());

@@ -4239,12 +4260,32 @@ static void lru_gen_age_node(struct pgli
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

- age_lruvec(lruvec, sc);
+ if (age_lruvec(lruvec, sc, min_ttl))
+ success = true;

cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));

clear_mm_walk();
+
+ /* check the order to exclude compaction-induced reclaim */
+ if (success || !min_ttl || sc->order)
+ return;
+
+ /*
+ * The main goal is to OOM kill if every generation from all memcgs is
+ * younger than min_ttl. However, another possibility is all memcgs are
+ * either below min or empty.
+ */
+ if (mutex_trylock(&oom_lock)) {
+ struct oom_control oc = {
+ .gfp_mask = sc->gfp_mask,
+ };
+
+ out_of_memory(&oc);
+
+ mutex_unlock(&oom_lock);
+ }
}

/*
@@ -5002,6 +5043,28 @@ unlock:
* sysfs interface
******************************************************************************/

+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 0, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
{
unsigned int caps = 0;
@@ -5050,6 +5113,7 @@ static struct kobj_attribute lru_gen_ena
);

static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
&lru_gen_enabled_attr.attr,
NULL
};
@@ -5065,12 +5129,16 @@ static struct attribute_group lru_gen_at

void lru_gen_init_lruvec(struct lruvec *lruvec)
{
+ int i;
int gen, type, zone;
struct lru_gen_struct *lrugen = &lruvec->lrugen;

lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();

+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);

@@ -0,0 +1,579 @@
From 530716d008ca26315f246cd70dc1cefc636beaa4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 18 Sep 2022 02:00:09 -0600
Subject: [PATCH 12/29] mm: multi-gen LRU: debugfs interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add /sys/kernel/debug/lru_gen for working set estimation and proactive
reclaim. These techniques are commonly used to optimize job scheduling
(bin packing) in data centers [1][2].

Compared with the page table-based approach and the PFN-based
approach, this lruvec-based approach has the following advantages:
1. It offers better choices because it is aware of memcgs, NUMA nodes,
   shared mappings and unmapped page cache.
2. It is more scalable because it is O(nr_hot_pages), whereas the
   PFN-based approach is O(nr_total_pages).

Add /sys/kernel/debug/lru_gen_full for debugging.
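
Going by the command parser added below (run_cmd() and the sscanf()
format in lru_gen_seq_write()), a sketch of driving the interface,
with illustrative memcg ID, node ID and sequence numbers:

# aging: '+ memcg_id node_id max_seq [can_swap [force_scan]]'
echo '+ 0 0 4' >/sys/kernel/debug/lru_gen
# eviction: '- memcg_id node_id min_seq [swappiness [nr_to_reclaim]]'
echo '- 0 0 3 0 100' >/sys/kernel/debug/lru_gen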

[1] https://dl.acm.org/doi/10.1145/3297858.3304053
[2] https://dl.acm.org/doi/10.1145/3503222.3507731

Link: https://lkml.kernel.org/r/20220918080010.2920238-13-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Brian Geffon <bgeffon@google.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Acked-by: Steven Barrett <steven@liquorix.net>
Acked-by: Suleiman Souhlal <suleiman@google.com>
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
Tested-by: Donald Carr <d@chaos-reins.com>
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/nodemask.h | 1 +
mm/vmscan.c | 411 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 402 insertions(+), 10 deletions(-)

--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -485,6 +485,7 @@ static inline int num_node_state(enum no
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,6 +53,7 @@
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
+#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -3968,12 +3969,40 @@ static void clear_mm_walk(void)
kfree(walk);
}

-static void inc_min_seq(struct lruvec *lruvec, int type)
+static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
{
+ int zone;
+ int remaining = MAX_LRU_BATCH;
struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ if (type == LRU_GEN_ANON && !can_swap)
+ goto done;
+
+ /* prevent cold/hot inversion if force_scan is true */
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct list_head *head = &lrugen->lists[old_gen][type][zone];
+
+ while (!list_empty(head)) {
+ struct page *page = lru_to_page(head);
+
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
+ VM_WARN_ON_ONCE_PAGE(page_is_file_lru(page) != type, page);
+ VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);

+ new_gen = page_inc_gen(lruvec, page, false);
+ list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+done:
reset_ctrl_pos(lruvec, type, true);
WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
+
+ return true;
}

static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
@@ -4019,7 +4048,7 @@ next:
return success;
}

-static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
{
int prev, next;
int type, zone;
@@ -4033,9 +4062,13 @@ static void inc_max_seq(struct lruvec *l
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
continue;

- VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap);
+ VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));

- inc_min_seq(lruvec, type);
+ while (!inc_min_seq(lruvec, type, can_swap)) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
}

/*
@@ -4072,7 +4105,7 @@ static void inc_max_seq(struct lruvec *l
}

static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap)
+ struct scan_control *sc, bool can_swap, bool force_scan)
{
bool success;
struct lru_gen_mm_walk *walk;
@@ -4093,7 +4126,7 @@ static bool try_to_inc_max_seq(struct lr
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -4107,7 +4140,7 @@ static bool try_to_inc_max_seq(struct lr
walk->lruvec = lruvec;
walk->max_seq = max_seq;
walk->can_swap = can_swap;
- walk->force_scan = false;
+ walk->force_scan = force_scan;

do {
success = iterate_mm_list(lruvec, walk, &mm);
@@ -4127,7 +4160,7 @@ done:

VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));

- inc_max_seq(lruvec, can_swap);
+ inc_max_seq(lruvec, can_swap, force_scan);
/* either this sees any waiters or they will see updated max_seq */
if (wq_has_sleeper(&lruvec->mm_state.wait))
wake_up_all(&lruvec->mm_state.wait);
@@ -4225,7 +4258,7 @@ static bool age_lruvec(struct lruvec *lr
}

if (need_aging)
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);

return true;
}
@@ -4784,7 +4817,7 @@ static unsigned long get_nr_to_scan(stru
if (current_is_kswapd())
return 0;

- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
return nr_to_scan;
done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
@@ -5124,6 +5157,361 @@ static struct attribute_group lru_gen_at
};

/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return get_lruvec(memcg, nid);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(memcg, nid);
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = lru_hist_from_seq(seq);
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ const char *s = " ";
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ s = "RT ";
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+ s = "rep";
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+ }
+
+ for (i = 0; i < 3; i++)
+ seq_printf(m, " %10lu%c", n[i], s[i]);
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ const char *s = " ";
+ unsigned long n = 0;
+
+ if (seq == max_seq && NR_HIST_GENS == 1) {
+ s = "LOYNFA";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ } else if (seq != max_seq && NR_HIST_GENS > 1) {
+ s = "loynfa";
+ n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
+ }
+
+ seq_printf(m, " %10lu%c", n, s[i]);
+ }
+ seq_putc(m, '\n');
+}
+
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ if (!full)
+ seq = min_seq[LRU_GEN_ANON];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int type, zone;
+ int gen = lru_gen_from_seq(seq);
+ unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
+
+ seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long size = 0;
+ char mark = full && seq < min_seq[type] ? 'x' : ' ';
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ seq_printf(m, " %10lu%c", size, mark);
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ bool can_swap, bool force_scan)
+{
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < max_seq)
+ return 0;
+
+ if (seq > max_seq)
+ return -EINVAL;
+
+ if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
+ return -ERANGE;
+
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
+
+ return 0;
+}
+
+static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
+ int swappiness, unsigned long nr_to_reclaim)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq + MIN_NR_GENS > max_seq)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < min_seq[!swappiness])
+ return 0;
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return 0;
+
+ if (!evict_pages(lruvec, sc, swappiness, NULL))
+ return 0;
+
+ cond_resched();
+ }
+
+ return -EINTR;
+}
+
+static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
+ struct scan_control *sc, int swappiness, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ return -EINVAL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ return -EINVAL;
+ }
+
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ lruvec = get_lruvec(memcg, nid);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(lruvec, sc);
+ else if (swappiness > 200)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, seq, sc, swappiness, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, seq, sc, swappiness, opt);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ struct blk_plug plug;
+ int err = -EINVAL;
+ struct scan_control sc = {
+ .may_writepage = true,
+ .may_unmap = true,
+ .may_swap = true,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ flags = memalloc_noreclaim_save();
+ blk_start_plug(&plug);
+ if (!set_mm_walk(NULL)) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
+ if (err)
+ break;
+ }
+done:
+ clear_mm_walk();
+ blk_finish_plug(&plug);
+ memalloc_noreclaim_restore(flags);
+ set_task_reclaim_state(current, NULL);
+
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/******************************************************************************
* initialization
******************************************************************************/

@@ -5180,6 +5568,9 @@ static int __init init_lru_gen(void)
if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
pr_err("lru_gen: failed to create sysfs group\n");

+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
return 0;
};
late_initcall(init_lru_gen);
@@ -0,0 +1,32 @@
From 92d430e8955c976eacb7cc91d7ff849c0dd009af Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 28 Sep 2022 13:36:58 -0600
Subject: [PATCH 13/29] mm/mglru: don't sync disk for each aging cycle

wakeup_flusher_threads() was added under the assumption that if a system
runs out of clean cold pages, it might want to write back dirty pages more
aggressively so that they can become clean and be dropped.

However, doing so can breach the rate limit a system wants to impose on
writeback, resulting in early SSD wearout.

Link: https://lkml.kernel.org/r/YzSiWq9UEER5LKup@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: Axel Rasmussen <axelrasmussen@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 2 --
1 file changed, 2 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4165,8 +4165,6 @@ done:
if (wq_has_sleeper(&lruvec->mm_state.wait))
wake_up_all(&lruvec->mm_state.wait);

- wakeup_flusher_threads(WB_REASON_VMSCAN);
-
return true;
}

@@ -0,0 +1,124 @@
From 6f315879ad750391a0b1fab8c9170bc054a5f5d7 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Tue, 15 Nov 2022 18:38:07 -0700
Subject: [PATCH 14/29] mm: multi-gen LRU: retry pages written back while
 isolated

The page reclaim isolates a batch of pages from the tail of one of the
LRU lists and works on those pages one by one. For a suitable
swap-backed page, if the swap device is async, it queues that page for
writeback. After the page reclaim finishes an entire batch, it puts back
the pages it queued for writeback to the head of the original LRU list.

In the meantime, the page writeback flushes the queued pages also by
batches. Its batching logic is independent from that of the page reclaim.
For each of the pages it writes back, the page writeback calls
rotate_reclaimable_page() which tries to rotate a page to the tail.

rotate_reclaimable_page() only works for a page after the page reclaim
has put it back. If an async swap device is fast enough, the page
writeback can finish with that page while the page reclaim is still
working on the rest of the batch containing it. In this case, that page
will remain at the head and the page reclaim will not retry it before
reaching there.

This patch adds a retry to evict_pages(). After evict_pages() has
finished an entire batch and before it puts back pages it cannot free
immediately, it retries those that may have missed the rotation.

Before this patch, ~60% of pages swapped to an Intel Optane missed
rotate_reclaimable_page(). After this patch, ~99% of missed pages were
reclaimed upon retry.

This problem affects relatively slow async swap devices like Samsung 980
Pro much less and does not affect sync swap devices like zram or zswap at
all.

Link: https://lkml.kernel.org/r/20221116013808.3995280-1-yuzhao@google.com
Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: "Yin, Fengwei" <fengwei.yin@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 48 +++++++++++++++++++++++++++++++++++++-----------
1 file changed, 37 insertions(+), 11 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4723,10 +4723,13 @@ static int evict_pages(struct lruvec *lr
int scanned;
int reclaimed;
LIST_HEAD(list);
+ LIST_HEAD(clean);
struct page *page;
+ struct page *next;
enum vm_event_item item;
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
+ bool skip_retry = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);

@@ -4743,20 +4746,37 @@ static int evict_pages(struct lruvec *lr

if (list_empty(&list))
return scanned;
-
+retry:
reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
+ sc->nr_reclaimed += reclaimed;

- list_for_each_entry(page, &list, lru) {
- /* restore LRU_REFS_FLAGS cleared by isolate_page() */
- if (PageWorkingset(page))
- SetPageReferenced(page);
+ list_for_each_entry_safe_reverse(page, next, &list, lru) {
+ if (!page_evictable(page)) {
+ list_del(&page->lru);
+ putback_lru_page(page);
+ continue;
+ }

- /* don't add rejected pages to the oldest generation */
if (PageReclaim(page) &&
- (PageDirty(page) || PageWriteback(page)))
- ClearPageActive(page);
- else
- SetPageActive(page);
+ (PageDirty(page) || PageWriteback(page))) {
+ /* restore LRU_REFS_FLAGS cleared by isolate_page() */
+ if (PageWorkingset(page))
+ SetPageReferenced(page);
+ continue;
+ }
+
+ if (skip_retry || PageActive(page) || PageReferenced(page) ||
+ page_mapped(page) || PageLocked(page) ||
+ PageDirty(page) || PageWriteback(page)) {
+ /* don't add rejected pages to the oldest generation */
+ set_mask_bits(&page->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
+ BIT(PG_active));
+ continue;
+ }
+
+ /* retry pages that may have missed rotate_reclaimable_page() */
+ list_move(&page->lru, &clean);
+ sc->nr_scanned -= thp_nr_pages(page);
}

spin_lock_irq(&lruvec->lru_lock);
@@ -4778,7 +4798,13 @@ static int evict_pages(struct lruvec *lr
mem_cgroup_uncharge_list(&list);
free_unref_page_list(&list);

- sc->nr_reclaimed += reclaimed;
+ INIT_LIST_HEAD(&list);
+ list_splice_init(&clean, &list);
+
+ if (!list_empty(&list)) {
+ skip_retry = true;
+ goto retry;
+ }

if (need_swapping && type == LRU_GEN_ANON)
*need_swapping = true;
@@ -0,0 +1,49 @@
From 255bb0ac393f1c2818cd75af45a9226300ab3daf Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Wed, 26 Oct 2022 15:48:30 +0200
Subject: [PATCH 15/29] mm: multi-gen LRU: move lru_gen_add_mm() out of IRQ-off
 region

lru_gen_add_mm() has been added within an IRQ-off region in the commit
mentioned below. The other invocations of lru_gen_add_mm() are not within
an IRQ-off region.

The invocation within IRQ-off region is problematic on PREEMPT_RT because
the function is using a spin_lock_t which must not be used within
IRQ-disabled regions.

The other invocations of lru_gen_add_mm() occur while
task_struct::alloc_lock is acquired. Move lru_gen_add_mm() after
interrupts are enabled and before task_unlock().

Link: https://lkml.kernel.org/r/20221026134830.711887-1-bigeasy@linutronix.de
Fixes: bd74fdaea1460 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/exec.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1013,7 +1013,6 @@ static int exec_mmap(struct mm_struct *m
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
- lru_gen_add_mm(mm);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
@@ -1028,6 +1027,7 @@ static int exec_mmap(struct mm_struct *m
local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
vmacache_flush(tsk);
+ lru_gen_add_mm(mm);
task_unlock(tsk);
lru_gen_use_mm(mm);
if (old_mm) {
@@ -0,0 +1,96 @@
From c5ec455ebd2b488d91de9d8915a0c8036a2a04dd Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 30 Nov 2022 14:49:41 -0800
Subject: [PATCH 17/29] mm: add dummy pmd_young() for architectures not having
 it

In order to avoid #ifdeffery add a dummy pmd_young() implementation as a
fallback. This is required for the later patch "mm: introduce
arch_has_hw_nonleaf_pmd_young()".

Link: https://lkml.kernel.org/r/fd3ac3cd-7349-6bbd-890a-71a9454ca0b3@suse.com
Signed-off-by: Juergen Gross <jgross@suse.com>
Acked-by: Yu Zhao <yuzhao@google.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Sander Eikelenboom <linux@eikelenboom.it>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
arch/mips/include/asm/pgtable.h | 1 +
arch/riscv/include/asm/pgtable.h | 1 +
arch/s390/include/asm/pgtable.h | 1 +
arch/sparc/include/asm/pgtable_64.h | 1 +
arch/x86/include/asm/pgtable.h | 1 +
include/linux/pgtable.h | 7 +++++++
6 files changed, 12 insertions(+)

--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -632,6 +632,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pm
return pmd;
}

+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return !!(pmd_val(pmd) & _PAGE_ACCESSED);
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -535,6 +535,7 @@ static inline int pmd_dirty(pmd_t pmd)
return pte_dirty(pmd_pte(pmd));
}

+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return pte_young(pmd_pte(pmd));
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -748,6 +748,7 @@ static inline int pmd_dirty(pmd_t pmd)
return (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) != 0;
}

+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) != 0;
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -712,6 +712,7 @@ static inline unsigned long pmd_dirty(pm
return pte_dirty(pte);
}

+#define pmd_young pmd_young
static inline unsigned long pmd_young(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -136,6 +136,7 @@ static inline int pmd_dirty(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_DIRTY;
}

+#define pmd_young pmd_young
static inline int pmd_young(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_ACCESSED;
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -164,6 +164,13 @@ static inline pte_t *virt_to_kpte(unsign
return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr);
}

+#ifndef pmd_young
+static inline int pmd_young(pmd_t pmd)
+{
+ return 0;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
extern int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
@@ -0,0 +1,113 @@
From 46cbda7b65998a5af4493f745d94417af697bd68 Mon Sep 17 00:00:00 2001
From: Juergen Gross <jgross@suse.com>
Date: Wed, 23 Nov 2022 07:45:10 +0100
Subject: [PATCH 18/29] mm: introduce arch_has_hw_nonleaf_pmd_young()

When running as a Xen PV guest, commit eed9a328aa1a ("mm: x86: add
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG") can cause a protection violation in
pmdp_test_and_clear_young():

BUG: unable to handle page fault for address: ffff8880083374d0
#PF: supervisor write access in kernel mode
#PF: error_code(0x0003) - permissions violation
PGD 3026067 P4D 3026067 PUD 3027067 PMD 7fee5067 PTE 8010000008337065
Oops: 0003 [#1] PREEMPT SMP NOPTI
CPU: 7 PID: 158 Comm: kswapd0 Not tainted 6.1.0-rc5-20221118-doflr+ #1
RIP: e030:pmdp_test_and_clear_young+0x25/0x40

This happens because the Xen hypervisor can't emulate direct writes to
page table entries other than PTEs.

This can easily be fixed by introducing arch_has_hw_nonleaf_pmd_young()
similar to arch_has_hw_pte_young() and testing that instead of
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG.

Link: https://lkml.kernel.org/r/20221123064510.16225-1-jgross@suse.com
Fixes: eed9a328aa1a ("mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG")
Signed-off-by: Juergen Gross <jgross@suse.com>
Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
Acked-by: Yu Zhao <yuzhao@google.com>
Tested-by: Sander Eikelenboom <linux@eikelenboom.it>
Acked-by: David Hildenbrand <david@redhat.com> [core changes]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
arch/x86/include/asm/pgtable.h | 8 ++++++++
include/linux/pgtable.h | 11 +++++++++++
mm/vmscan.c | 10 +++++-----
3 files changed, 24 insertions(+), 5 deletions(-)

--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1405,6 +1405,14 @@ static inline bool arch_has_hw_pte_young
return true;
}

+#ifdef CONFIG_XEN_PV
+#define arch_has_hw_nonleaf_pmd_young arch_has_hw_nonleaf_pmd_young
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+ return !cpu_feature_enabled(X86_FEATURE_XENPV);
+}
+#endif
+
#endif /* __ASSEMBLY__ */

#endif /* _ASM_X86_PGTABLE_H */
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -266,6 +266,17 @@ static inline int pmdp_clear_flush_young
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

+#ifndef arch_has_hw_nonleaf_pmd_young
+/*
+ * Return whether the accessed bit in non-leaf PMD entries is supported on the
+ * local CPU.
+ */
+static inline bool arch_has_hw_nonleaf_pmd_young(void)
+{
+ return IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG);
+}
+#endif
+
#ifndef arch_has_hw_pte_young
/*
* Return whether the accessed bit is supported on the local CPU.
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3727,7 +3727,7 @@ static void walk_pmd_range_locked(pud_t
goto next;

if (!pmd_trans_huge(pmd[i])) {
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ if (arch_has_hw_nonleaf_pmd_young() &&
get_cap(LRU_GEN_NONLEAF_YOUNG))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
@@ -3825,14 +3825,14 @@ restart:
#endif
walk->mm_stats[MM_NONLEAF_TOTAL]++;

-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (arch_has_hw_nonleaf_pmd_young() &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG)) {
if (!pmd_young(val))
continue;

walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
}
-#endif
+
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
continue;

@@ -5132,7 +5132,7 @@ static ssize_t show_enabled(struct kobje
if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
caps |= BIT(LRU_GEN_MM_WALK);

- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
caps |= BIT(LRU_GEN_NONLEAF_YOUNG);

return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
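The fix swaps a compile-time IS_ENABLED() test for a runtime hook so a Xen PV guest can opt out even when the kernel was built with support. A minimal userspace model of that pattern, where running_as_xen_pv stands in for cpu_feature_enabled(X86_FEATURE_XENPV) and all names are illustrative:

#include <stdbool.h>
#include <stdio.h>

#define CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG 1

static bool running_as_xen_pv;	/* stand-in for the CPU feature check */

static bool arch_has_hw_nonleaf_pmd_young(void)
{
	/* Compile-time support is necessary but not sufficient: a Xen PV
	 * guest cannot write non-leaf entries directly, so report false. */
	return CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG && !running_as_xen_pv;
}

int main(void)
{
	running_as_xen_pv = false;
	printf("bare metal: %d\n", arch_has_hw_nonleaf_pmd_young());	/* 1 */
	running_as_xen_pv = true;
	printf("Xen PV:     %d\n", arch_has_hw_nonleaf_pmd_young());	/* 0 */
	return 0;
}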
@@ -0,0 +1,56 @@
From c7dfefd4bdfba3d5171038d1cc2d4160288e6ee4 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 15 Jan 2023 20:44:05 -0700
Subject: [PATCH 16/29] mm: multi-gen LRU: fix crash during cgroup migration

lru_gen_migrate_mm() assumes lru_gen_add_mm() runs prior to itself. This
isn't true for the following scenario:

    CPU 1                          CPU 2

  clone()
    cgroup_can_fork()
                                   cgroup_procs_write()
    cgroup_post_fork()
                                     task_lock()
                                     lru_gen_migrate_mm()
                                     task_unlock()
    task_lock()
    lru_gen_add_mm()
    task_unlock()

And when the above happens, the kernel crashes because of linked list
corruption (mm_struct->lru_gen.list).

Link: https://lore.kernel.org/r/20230115134651.30028-1-msizanoen@qtmlabs.xyz/
Link: https://lkml.kernel.org/r/20230116034405.2960276-1-yuzhao@google.com
Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: msizanoen <msizanoen@qtmlabs.xyz>
Tested-by: msizanoen <msizanoen@qtmlabs.xyz>
Cc: <stable@vger.kernel.org> [6.1+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3024,13 +3024,16 @@ void lru_gen_migrate_mm(struct mm_struct
if (mem_cgroup_disabled())
return;

+ /* migration can happen before addition */
+ if (!mm->lru_gen.memcg)
+ return;
+
rcu_read_lock();
memcg = mem_cgroup_from_task(task);
rcu_read_unlock();
if (memcg == mm->lru_gen.memcg)
return;

- VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));

lru_gen_del_mm(mm);
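The fix replaces a warning on a not-yet-initialised field with an early return, since migration can now legitimately observe an mm before lru_gen_add_mm() has run. A minimal standalone model of that ordering, with simplified stand-ins for mm_struct and mem_cgroup:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct memcg { int id; };

struct mm {
	struct memcg *memcg;	/* NULL until the "add" step has run */
};

/* Counterpart to lru_gen_migrate_mm(): bail out instead of warning when
 * migration races ahead of addition. */
static bool migrate(struct mm *mm, struct memcg *to)
{
	/* migration can happen before addition */
	if (!mm->memcg)
		return false;
	if (mm->memcg == to)
		return false;
	mm->memcg = to;
	return true;
}

int main(void)
{
	struct memcg a = { 1 };
	struct mm mm = { .memcg = NULL };

	printf("before add: migrated=%d\n", migrate(&mm, &a));	/* 0: safe no-op */
	mm.memcg = &a;		/* counterpart to lru_gen_add_mm() */
	printf("after add:  migrated=%d\n", migrate(&mm, &a));	/* 0: same memcg */
	return 0;
}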
@@ -0,0 +1,196 @@
From 6c7f552a48b49a8612786a28a2239fbc24fac289 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Fri, 30 Dec 2022 14:52:51 -0700
Subject: [PATCH 19/29] mm: add vma_has_recency()

Add vma_has_recency() to indicate whether a VMA may exhibit temporal
locality that the LRU algorithm relies on.

This function returns false for VMAs marked by VM_SEQ_READ or
VM_RAND_READ. While the former flag indicates linear access, i.e., a
special case of spatial locality, both flags indicate a lack of temporal
locality, i.e., the reuse of an area within a relatively small duration.

"Recency" is chosen over "locality" to avoid confusion between temporal
and spatial localities.

Before this patch, the active/inactive LRU only ignored the accessed bit
from VMAs marked by VM_SEQ_READ. After this patch, the active/inactive
LRU and MGLRU share the same logic: they both ignore the accessed bit if
vma_has_recency() returns false.

For the active/inactive LRU, the following fio test showed a [6, 8]%
increase in IOPS when randomly accessing mapped files under memory
pressure.

kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo)
kb=$((kb - 8*1024*1024))

modprobe brd rd_nr=1 rd_size=$kb
dd if=/dev/zero of=/dev/ram0 bs=1M

mkfs.ext4 /dev/ram0
mount /dev/ram0 /mnt/
swapoff -a

fio --name=test --directory=/mnt/ --ioengine=mmap --numjobs=8 \
--size=8G --rw=randrw --time_based --runtime=10m \
--group_reporting

The discussion that led to this patch is here [1]. Additional test
results are available in that thread.

[1] https://lore.kernel.org/r/Y31s%2FK8T85jh05wH@google.com/

Link: https://lkml.kernel.org/r/20221230215252.2628425-1-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mm_inline.h | 9 +++++++++
mm/memory.c | 8 ++++----
mm/rmap.c | 42 +++++++++++++++++----------------------
mm/vmscan.c | 5 ++++-
4 files changed, 35 insertions(+), 29 deletions(-)

--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -333,4 +333,13 @@ static __always_inline void del_page_fro
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-thp_nr_pages(page));
}
+
+static inline bool vma_has_recency(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
+ return false;
+
+ return true;
+}
+
#endif
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -41,6 +41,7 @@

#include <linux/kernel_stat.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
@@ -1353,8 +1354,7 @@ again:
force_flush = 1;
set_page_dirty(page);
}
- if (pte_young(ptent) &&
- likely(!(vma->vm_flags & VM_SEQ_READ)))
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
mark_page_accessed(page);
}
rss[mm_counter(page)]--;
@@ -4795,8 +4795,8 @@ static inline void mm_account_fault(stru
#ifdef CONFIG_LRU_GEN
static void lru_gen_enter_fault(struct vm_area_struct *vma)
{
- /* the LRU algorithm doesn't apply to sequential or random reads */
- current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
+ /* the LRU algorithm only applies to accesses with recency */
+ current->in_lru_fault = vma_has_recency(vma);
}

static void lru_gen_exit_fault(void)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -794,25 +794,14 @@ static bool page_referenced_one(struct p
}

if (pvmw.pte) {
- if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
- !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
+ if (lru_gen_enabled() && pte_young(*pvmw.pte)) {
lru_gen_look_around(&pvmw);
referenced++;
}

if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- /*
- * Don't treat a reference through
- * a sequentially read mapping as such.
- * If the page has been used in another mapping,
- * we will catch it; if this other mapping is
- * already gone, the unmap path will have set
- * PG_referenced or activated the page.
- */
- if (likely(!(vma->vm_flags & VM_SEQ_READ)))
- referenced++;
- }
+ pvmw.pte))
+ referenced++;
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
@@ -846,7 +835,20 @@ static bool invalid_page_referenced_vma(
struct page_referenced_arg *pra = arg;
struct mem_cgroup *memcg = pra->memcg;

- if (!mm_match_cgroup(vma->vm_mm, memcg))
+ /*
+ * Ignore references from this mapping if it has no recency. If the
+ * page has been used in another mapping, we will catch it; if this
+ * other mapping is already gone, the unmap path will have set the
+ * referenced flag or activated the page in zap_pte_range().
+ */
+ if (!vma_has_recency(vma))
+ return true;
+
+ /*
+ * If we are reclaiming on behalf of a cgroup, skip counting on behalf
+ * of references from different cgroups.
+ */
+ if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
return true;

return false;
@@ -876,6 +878,7 @@ int page_referenced(struct page *page,
.rmap_one = page_referenced_one,
.arg = (void *)&pra,
.anon_lock = page_lock_anon_vma_read,
+ .invalid_vma = invalid_page_referenced_vma,
};

*vm_flags = 0;
@@ -891,15 +894,6 @@ int page_referenced(struct page *page,
return 1;
}

- /*
- * If we are reclaiming on behalf of a cgroup, skip
- * counting on behalf of references from different
- * cgroups
- */
- if (memcg) {
- rwc.invalid_vma = invalid_page_referenced_vma;
- }
-
rmap_walk(page, &rwc);
*vm_flags = pra.vm_flags;

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3486,7 +3486,10 @@ static int should_skip_vma(unsigned long
if (is_vm_hugetlb_page(vma))
return true;

- if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
+ if (!vma_has_recency(vma))
+ return true;
+
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
return true;

if (vma == get_gate_vma(vma->vm_mm))
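A standalone model of the new helper and how a caller consults it before honoring the accessed bit; the struct and the vm_flags bit values below are simplified stand-ins for the kernel's:

#include <stdbool.h>
#include <stdio.h>

#define VM_SEQ_READ  (1UL << 0)	/* illustrative bit values */
#define VM_RAND_READ (1UL << 1)

struct vma { unsigned long vm_flags; };

/* Mirrors the patch: both flags signal a lack of temporal locality. */
static bool vma_has_recency(const struct vma *vma)
{
	return !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
}

int main(void)
{
	struct vma normal = { 0 };
	struct vma stream = { VM_SEQ_READ };

	/* a caller like zap_pte_range() would only mark pages accessed
	 * when the mapping has recency */
	printf("normal mapping: %d\n", vma_has_recency(&normal));	/* 1 */
	printf("stream mapping: %d\n", vma_has_recency(&stream));	/* 0 */
	return 0;
}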
@@ -0,0 +1,125 @@
From 686c3d4f71de9e0e7a27f03a5617a712385f90cd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Fri, 30 Dec 2022 14:52:52 -0700
Subject: [PATCH 20/29] mm: support POSIX_FADV_NOREUSE

This patch adds POSIX_FADV_NOREUSE to vma_has_recency() so that the LRU
algorithm can ignore access to mapped files marked by this flag.

The advantages of POSIX_FADV_NOREUSE are:
1. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not alter the
default readahead behavior.
2. Unlike MADV_SEQUENTIAL and MADV_RANDOM, it does not split VMAs and
therefore does not take mmap_lock.
3. Unlike MADV_COLD, setting it has a negligible cost, regardless of
how many pages it affects.

Its limitations are:
1. Like POSIX_FADV_RANDOM and POSIX_FADV_SEQUENTIAL, it currently does
not support range. IOW, its scope is the entire file.
2. It currently does not ignore access through file descriptors.
Specifically, for the active/inactive LRU, given a file page shared
by two users and one of them having set POSIX_FADV_NOREUSE on the
file, this page will be activated upon the second user accessing
it. This corner case can be covered by checking POSIX_FADV_NOREUSE
before calling mark_page_accessed() on the read path. But it is
considered not worth the effort.

There have been a few attempts to support POSIX_FADV_NOREUSE, e.g., [1].
This time the goal is to fill a niche: a few desktop applications, e.g.,
large file transferring and video encoding/decoding, want fast file
streaming with mmap() rather than direct IO. Among those applications, an
SVT-AV1 regression was reported when running with MGLRU [2]. The
following test can reproduce that regression.

kb=$(awk '/MemTotal/ { print $2 }' /proc/meminfo)
kb=$((kb - 8*1024*1024))

modprobe brd rd_nr=1 rd_size=$kb
dd if=/dev/zero of=/dev/ram0 bs=1M

mkfs.ext4 /dev/ram0
mount /dev/ram0 /mnt/
swapoff -a

fallocate -l 8G /mnt/swapfile
mkswap /mnt/swapfile
swapon /mnt/swapfile

wget http://ultravideo.cs.tut.fi/video/Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z
7z e -o/mnt/ Bosphorus_3840x2160_120fps_420_8bit_YUV_Y4M.7z
SvtAv1EncApp --preset 12 -w 3840 -h 2160 \
-i /mnt/Bosphorus_3840x2160.y4m

For MGLRU, the following change showed a [9-11]% increase in FPS,
which makes it on par with the active/inactive LRU.

patch Source/App/EncApp/EbAppMain.c <<EOF
31a32
> #include <fcntl.h>
35d35
< #include <fcntl.h> /* _O_BINARY */
117a118
> posix_fadvise(config->mmap.fd, 0, 0, POSIX_FADV_NOREUSE);
EOF

[1] https://lore.kernel.org/r/1308923350-7932-1-git-send-email-andrea@betterlinux.com/
[2] https://openbenchmarking.org/result/2209259-PTS-MGLRU8GB57

Link: https://lkml.kernel.org/r/20221230215252.2628425-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/fs.h | 2 ++
include/linux/mm_inline.h | 3 +++
mm/fadvise.c | 5 ++++-
3 files changed, 9 insertions(+), 1 deletion(-)

--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -167,6 +167,8 @@ typedef int (dio_iodone_t)(struct kiocb
/* File is stream-like */
#define FMODE_STREAM ((__force fmode_t)0x200000)

+#define FMODE_NOREUSE ((__force fmode_t)0x400000)
+
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)

--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -339,6 +339,9 @@ static inline bool vma_has_recency(struc
if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))
return false;

+ if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE))
+ return false;
+
return true;
}

--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -80,7 +80,7 @@ int generic_fadvise(struct file *file, l
case POSIX_FADV_NORMAL:
file->f_ra.ra_pages = bdi->ra_pages;
spin_lock(&file->f_lock);
- file->f_mode &= ~FMODE_RANDOM;
+ file->f_mode &= ~(FMODE_RANDOM | FMODE_NOREUSE);
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_RANDOM:
@@ -107,6 +107,9 @@ int generic_fadvise(struct file *file, l
force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
+ spin_lock(&file->f_lock);
+ file->f_mode |= FMODE_NOREUSE;
+ spin_unlock(&file->f_lock);
break;
case POSIX_FADV_DONTNEED:
if (!inode_write_congested(mapping->host))
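From userspace the feature is reached through posix_fadvise(); with offset and len both 0 the advice covers the whole file, which matches the patch's file-wide scope. A minimal example in the spirit of the SVT-AV1 tweak quoted above:

#define _POSIX_C_SOURCE 200112L	/* for posix_fadvise() */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}

	int fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* len == 0 extends the advice to the end of the file */
	int err = posix_fadvise(fd, 0, 0, POSIX_FADV_NOREUSE);
	if (err)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));

	/* ... mmap() and stream the file here ... */

	close(fd);
	return err ? 1 : 0;
}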
@@ -0,0 +1,348 @@
From 348fdbada9fb3f0bf1a53651be46319105af187f Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:18:59 -0700
Subject: [PATCH 21/29] mm: multi-gen LRU: rename lru_gen_struct to
 lru_gen_page

Patch series "mm: multi-gen LRU: memcg LRU", v3.

Overview
========

An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
since each node and memcg combination has an LRU of pages (see
mem_cgroup_lruvec()).

Its goal is to improve the scalability of global reclaim, which is
critical to system-wide memory overcommit in data centers. Note that
memcg reclaim is currently out of scope.

Its memory bloat is a pointer to each lruvec and negligible to each
pglist_data. In terms of traversing memcgs during global reclaim, it
improves the best-case complexity from O(n) to O(1) and does not affect
the worst-case complexity O(n). Therefore, on average, it has a sublinear
complexity in contrast to the current linear complexity.

The basic structure of an memcg LRU can be understood by an analogy to
the active/inactive LRU (of pages):
1. It has the young and the old (generations), i.e., the counterparts
to the active and the inactive;
2. The increment of max_seq triggers promotion, i.e., the counterpart
to activation;
3. Other events trigger similar operations, e.g., offlining an memcg
triggers demotion, i.e., the counterpart to deactivation.

In terms of global reclaim, it has two distinct features:
1. Sharding, which allows each thread to start at a random memcg (in
the old generation) and improves parallelism;
2. Eventual fairness, which allows direct reclaim to bail out at will
and reduces latency without affecting fairness over some time.

The commit message in patch 6 details the workflow:
https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/

The following is a simple test to quickly verify its effectiveness.

Test design:
1. Create multiple memcgs.
2. Each memcg contains a job (fio).
3. All jobs access the same amount of memory randomly.
4. The system does not experience global memory pressure.
5. Periodically write to the root memory.reclaim.

Desired outcome:
1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal)
over mean(pgsteal) is close to 0%.
2. The total pgsteal is close to the total requested through
memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close
to 100%.

Actual outcome [1]:

                                  MGLRU off    MGLRU on
  stddev(pgsteal) / mean(pgsteal)      75%         20%
  sum(pgsteal) / sum(requested)       425%         95%

####################################################################
MEMCGS=128

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
mkdir /sys/fs/cgroup/memcg$memcg
done

start() {
echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs

fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
--filename=/dev/zero --size=1920M --rw=randrw \
--rate=64m,64m --random_distribution=random \
--fadvise_hint=0 --time_based --runtime=10h \
--group_reporting --minimal
}

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
start &
done

sleep 600

for ((i = 0; i < 600; i++)); do
echo 256m >/sys/fs/cgroup/memory.reclaim
sleep 6
done

for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
done
####################################################################

[1]: This was obtained from running the above script (touches less
than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an
hour.

This patch (of 8):

The new name lru_gen_page will be more distinct from the coming
lru_gen_memcg.

Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com
Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mm_inline.h | 4 ++--
include/linux/mmzone.h | 6 +++---
mm/vmscan.c | 34 +++++++++++++++++-----------------
mm/workingset.c | 4 ++--
4 files changed, 24 insertions(+), 24 deletions(-)

--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -168,7 +168,7 @@ static inline void lru_gen_update_size(s
int zone = page_zonenum(page);
int delta = thp_nr_pages(page);
enum lru_list lru = type * LRU_INACTIVE_FILE;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
@@ -214,7 +214,7 @@ static inline bool lru_gen_add_page(stru
int gen = page_lru_gen(page);
int type = page_is_file_lru(page);
int zone = page_zonenum(page);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

VM_WARN_ON_ONCE_PAGE(gen != -1, page);

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -394,7 +394,7 @@ enum {
* The number of pages in each generation is eventually consistent and therefore
* can be transiently negative when reset_batch_size() is pending.
*/
-struct lru_gen_struct {
+struct lru_gen_page {
/* the aging increments the youngest generation number */
unsigned long max_seq;
/* the eviction increments the oldest generation numbers */
@@ -451,7 +451,7 @@ struct lru_gen_mm_state {
struct lru_gen_mm_walk {
/* the lruvec under reclaim */
struct lruvec *lruvec;
- /* unstable max_seq from lru_gen_struct */
+ /* unstable max_seq from lru_gen_page */
unsigned long max_seq;
/* the next address within an mm to scan */
unsigned long next_addr;
@@ -514,7 +514,7 @@ struct lruvec {
unsigned long flags;
#ifdef CONFIG_LRU_GEN
/* evictable pages divided into generations */
- struct lru_gen_struct lrugen;
+ struct lru_gen_page lrugen;
/* to concurrently iterate lru_gen_mm_list */
struct lru_gen_mm_state mm_state;
#endif
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2910,7 +2910,7 @@ static int get_nr_gens(struct lruvec *lr

static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
{
- /* see the comment on lru_gen_struct */
+ /* see the comment on lru_gen_page */
return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
@@ -3316,7 +3316,7 @@ struct ctrl_pos {
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
struct ctrl_pos *pos)
{
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
int hist = lru_hist_from_seq(lrugen->min_seq[type]);

pos->refaulted = lrugen->avg_refaulted[type][tier] +
@@ -3331,7 +3331,7 @@ static void read_ctrl_pos(struct lruvec
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
{
int hist, tier;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;

@@ -3408,7 +3408,7 @@ static int page_update_gen(struct page *
static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
int type = page_is_file_lru(page);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
unsigned long new_flags, old_flags = READ_ONCE(page->flags);

@@ -3453,7 +3453,7 @@ static void update_batch_size(struct lru
static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
{
int gen, type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

walk->batched = 0;

@@ -3979,7 +3979,7 @@ static bool inc_min_seq(struct lruvec *l
{
int zone;
int remaining = MAX_LRU_BATCH;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);

if (type == LRU_GEN_ANON && !can_swap)
@@ -4015,7 +4015,7 @@ static bool try_to_inc_min_seq(struct lr
{
int gen, type, zone;
bool success = false;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
DEFINE_MIN_SEQ(lruvec);

VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -4036,7 +4036,7 @@ next:
;
}

- /* see the comment on lru_gen_struct */
+ /* see the comment on lru_gen_page */
if (can_swap) {
min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
@@ -4058,7 +4058,7 @@ static void inc_max_seq(struct lruvec *l
{
int prev, next;
int type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

spin_lock_irq(&lruvec->lru_lock);

@@ -4116,7 +4116,7 @@ static bool try_to_inc_max_seq(struct lr
bool success;
struct lru_gen_mm_walk *walk;
struct mm_struct *mm = NULL;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));

@@ -4181,7 +4181,7 @@ static bool should_run_aging(struct lruv
unsigned long old = 0;
unsigned long young = 0;
unsigned long total = 0;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);

for (type = !can_swap; type < ANON_AND_FILE; type++) {
@@ -4466,7 +4466,7 @@ static bool sort_page(struct lruvec *lru
int delta = thp_nr_pages(page);
int refs = page_lru_refs(page);
int tier = lru_tier_from_refs(refs);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

VM_WARN_ON_ONCE_PAGE(gen >= MAX_NR_GENS, page);

@@ -4566,7 +4566,7 @@ static int scan_pages(struct lruvec *lru
int scanned = 0;
int isolated = 0;
int remaining = MAX_LRU_BATCH;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);

VM_WARN_ON_ONCE(!list_empty(list));
@@ -4967,7 +4967,7 @@ done:

static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
{
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

if (lrugen->enabled) {
enum lru_list lru;
@@ -5247,7 +5247,7 @@ static void lru_gen_seq_show_full(struct
int i;
int type, tier;
int hist = lru_hist_from_seq(seq);
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

for (tier = 0; tier < MAX_NR_TIERS; tier++) {
seq_printf(m, " %10d", tier);
@@ -5296,7 +5296,7 @@ static int lru_gen_seq_show(struct seq_f
unsigned long seq;
bool full = !debugfs_real_fops(m->file)->write;
struct lruvec *lruvec = v;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
int nid = lruvec_pgdat(lruvec)->node_id;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
@@ -5549,7 +5549,7 @@ void lru_gen_init_lruvec(struct lruvec *
{
int i;
int gen, type, zone;
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;

lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled = lru_gen_enabled();
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct pag
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_page *lrugen;
int type = page_is_file_lru(page);
int delta = thp_nr_pages(page);
int refs = page_lru_refs(page);
@@ -252,7 +252,7 @@ static void lru_gen_refault(struct page
unsigned long token;
unsigned long min_seq;
struct lruvec *lruvec;
- struct lru_gen_struct *lrugen;
+ struct lru_gen_page *lrugen;
struct mem_cgroup *memcg;
struct pglist_data *pgdat;
int type = page_is_file_lru(page);
@@ -0,0 +1,162 @@
From afd37e73db04c7e6b47411120ac5f6a7eca51fec Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:00 -0700
Subject: [PATCH 22/29] mm: multi-gen LRU: rename lrugen->lists[] to
 lrugen->pages[]

lru_gen_page will be chained into per-node lists by the coming
lrugen->list.

Link: https://lkml.kernel.org/r/20221222041905.2431096-3-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/mm_inline.h | 4 ++--
include/linux/mmzone.h | 8 ++++----
mm/vmscan.c | 20 ++++++++++----------
3 files changed, 16 insertions(+), 16 deletions(-)

--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -246,9 +246,9 @@ static inline bool lru_gen_add_page(stru
lru_gen_update_size(lruvec, page, -1, gen);
/* for rotate_reclaimable_page() */
if (reclaiming)
- list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+ list_add_tail(&page->lru, &lrugen->pages[gen][type][zone]);
else
- list_add(&page->lru, &lrugen->lists[gen][type][zone]);
+ list_add(&page->lru, &lrugen->pages[gen][type][zone]);

return true;
}
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -302,7 +302,7 @@ enum lruvec_flags {
* They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
* offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
* corresponding generation. The gen counter in page->flags stores gen+1 while
- * a page is on one of lrugen->lists[]. Otherwise it stores 0.
+ * a page is on one of lrugen->pages[]. Otherwise it stores 0.
*
* A page is added to the youngest generation on faulting. The aging needs to
* check the accessed bit at least twice before handing this page over to the
@@ -314,8 +314,8 @@ enum lruvec_flags {
* rest of generations, if they exist, are considered inactive. See
* lru_gen_is_active().
*
- * PG_active is always cleared while a page is on one of lrugen->lists[] so that
- * the aging needs not to worry about it. And it's set again when a page
+ * PG_active is always cleared while a page is on one of lrugen->pages[] so
+ * that the aging needs not to worry about it. And it's set again when a page
* considered active is isolated for non-reclaiming purposes, e.g., migration.
* See lru_gen_add_page() and lru_gen_del_page().
*
@@ -402,7 +402,7 @@ struct lru_gen_page {
/* the birth time of each generation in jiffies */
unsigned long timestamps[MAX_NR_GENS];
/* the multi-gen LRU lists, lazily sorted on eviction */
- struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ struct list_head pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the multi-gen LRU sizes, eventually consistent */
long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
/* the exponential moving average of refaulted */
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3987,7 +3987,7 @@ static bool inc_min_seq(struct lruvec *l

/* prevent cold/hot inversion if force_scan is true */
for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- struct list_head *head = &lrugen->lists[old_gen][type][zone];
+ struct list_head *head = &lrugen->pages[old_gen][type][zone];

while (!list_empty(head)) {
struct page *page = lru_to_page(head);
@@ -3998,7 +3998,7 @@ static bool inc_min_seq(struct lruvec *l
VM_WARN_ON_ONCE_PAGE(page_zonenum(page) != zone, page);

new_gen = page_inc_gen(lruvec, page, false);
- list_move_tail(&page->lru, &lrugen->lists[new_gen][type][zone]);
+ list_move_tail(&page->lru, &lrugen->pages[new_gen][type][zone]);

if (!--remaining)
return false;
@@ -4026,7 +4026,7 @@ static bool try_to_inc_min_seq(struct lr
gen = lru_gen_from_seq(min_seq[type]);

for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- if (!list_empty(&lrugen->lists[gen][type][zone]))
+ if (!list_empty(&lrugen->pages[gen][type][zone]))
goto next;
}

@@ -4491,7 +4491,7 @@ static bool sort_page(struct lruvec *lru

/* promoted */
if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
- list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+ list_move(&page->lru, &lrugen->pages[gen][type][zone]);
return true;
}

@@ -4500,7 +4500,7 @@ static bool sort_page(struct lruvec *lru
int hist = lru_hist_from_seq(lrugen->min_seq[type]);

gen = page_inc_gen(lruvec, page, false);
- list_move_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+ list_move_tail(&page->lru, &lrugen->pages[gen][type][zone]);

WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
lrugen->protected[hist][type][tier - 1] + delta);
@@ -4512,7 +4512,7 @@ static bool sort_page(struct lruvec *lru
if (PageLocked(page) || PageWriteback(page) ||
(type == LRU_GEN_FILE && PageDirty(page))) {
gen = page_inc_gen(lruvec, page, true);
- list_move(&page->lru, &lrugen->lists[gen][type][zone]);
+ list_move(&page->lru, &lrugen->pages[gen][type][zone]);
return true;
}

@@ -4579,7 +4579,7 @@ static int scan_pages(struct lruvec *lru
for (zone = sc->reclaim_idx; zone >= 0; zone--) {
LIST_HEAD(moved);
int skipped = 0;
- struct list_head *head = &lrugen->lists[gen][type][zone];
+ struct list_head *head = &lrugen->pages[gen][type][zone];

while (!list_empty(head)) {
struct page *page = lru_to_page(head);
@@ -4980,7 +4980,7 @@ static bool __maybe_unused state_is_vali
int gen, type, zone;

for_each_gen_type_zone(gen, type, zone) {
- if (!list_empty(&lrugen->lists[gen][type][zone]))
+ if (!list_empty(&lrugen->pages[gen][type][zone]))
return false;
}
}
@@ -5025,7 +5025,7 @@ static bool drain_evictable(struct lruve
int remaining = MAX_LRU_BATCH;

for_each_gen_type_zone(gen, type, zone) {
- struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+ struct list_head *head = &lruvec->lrugen.pages[gen][type][zone];

while (!list_empty(head)) {
bool success;
@@ -5558,7 +5558,7 @@ void lru_gen_init_lruvec(struct lruvec *
lrugen->timestamps[i] = jiffies;

for_each_gen_type_zone(gen, type, zone)
- INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+ INIT_LIST_HEAD(&lrugen->pages[gen][type][zone]);

lruvec->mm_state.seq = MIN_NR_GENS;
init_waitqueue_head(&lruvec->mm_state.wait);
@@ -0,0 +1,188 @@
From ce45f1c4b32cf69b166f56ef5bc6c761e06ed4e5 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:01 -0700
Subject: [PATCH 23/29] mm: multi-gen LRU: remove eviction fairness safeguard

Recall that the eviction consumes the oldest generation: first it
bucket-sorts pages whose gen counters were updated by the aging and
reclaims the rest; then it increments lrugen->min_seq.

The current eviction fairness safeguard for global reclaim has a
dilemma: when there are multiple eligible memcgs, should it continue
or stop upon meeting the reclaim goal? If it continues, it overshoots
and increases direct reclaim latency; if it stops, it loses fairness
between memcgs it has taken memory away from and those it has yet to.

With memcg LRU, the eviction, while ensuring eventual fairness, will
stop upon meeting its goal. Therefore the current eviction fairness
safeguard for global reclaim will not be needed.

Note that memcg LRU only applies to global reclaim. For memcg reclaim,
the eviction will continue, even if it is overshooting. This becomes
unconditional due to code simplification.

Link: https://lkml.kernel.org/r/20221222041905.2431096-4-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 82 +++++++++++++++--------------------------------------
1 file changed, 23 insertions(+), 59 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,11 @@ static bool cgroup_reclaim(struct scan_c
return sc->target_mem_cgroup;
}

+static bool global_reclaim(struct scan_control *sc)
+{
+ return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
+}
+
/**
* writeback_throttling_sane - is the usual dirty throttling mechanism available?
* @sc: scan_control in question
@@ -493,6 +498,11 @@ static bool cgroup_reclaim(struct scan_c
return false;
}

+static bool global_reclaim(struct scan_control *sc)
+{
+ return true;
+}
+
static bool writeback_throttling_sane(struct scan_control *sc)
{
return true;
@@ -4722,8 +4732,7 @@ static int isolate_pages(struct lruvec *
return scanned;
}

-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
- bool *need_swapping)
+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
{
int type;
int scanned;
@@ -4812,9 +4821,6 @@ retry:
goto retry;
}

- if (need_swapping && type == LRU_GEN_ANON)
- *need_swapping = true;
-
return scanned;
}

@@ -4853,68 +4859,26 @@ done:
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
}

-static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
- struct scan_control *sc, bool need_swapping)
+static unsigned long get_nr_to_reclaim(struct scan_control *sc)
{
- int i;
- DEFINE_MAX_SEQ(lruvec);
-
- if (!current_is_kswapd()) {
- /* age each memcg once to ensure fairness */
- if (max_seq - seq > 1)
- return true;
-
- /* over-swapping can increase allocation latency */
- if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
- return true;
-
- /* give this thread a chance to exit and free its memory */
- if (fatal_signal_pending(current)) {
- sc->nr_reclaimed += MIN_LRU_BATCH;
- return true;
- }
-
- if (cgroup_reclaim(sc))
- return false;
- } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
- return false;
-
- /* keep scanning at low priorities to ensure fairness */
- if (sc->priority > DEF_PRIORITY - 2)
- return false;
-
- /*
- * A minimum amount of work was done under global memory pressure. For
- * kswapd, it may be overshooting. For direct reclaim, the target isn't
- * met, and yet the allocation may still succeed, since kswapd may have
- * caught up. In either case, it's better to stop now, and restart if
- * necessary.
- */
- for (i = 0; i <= sc->reclaim_idx; i++) {
- unsigned long wmark;
- struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
-
- if (!managed_zone(zone))
- continue;
-
- wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
- if (wmark > zone_page_state(zone, NR_FREE_PAGES))
- return false;
- }
+ /* don't abort memcg reclaim to ensure fairness */
+ if (!global_reclaim(sc))
+ return -1;

- sc->nr_reclaimed += MIN_LRU_BATCH;
+ /* discount the previous progress for kswapd */
+ if (current_is_kswapd())
+ return sc->nr_to_reclaim + sc->last_reclaimed;

- return true;
+ return max(sc->nr_to_reclaim, compact_gap(sc->order));
}

static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
bool need_aging = false;
- bool need_swapping = false;
unsigned long scanned = 0;
unsigned long reclaimed = sc->nr_reclaimed;
- DEFINE_MAX_SEQ(lruvec);
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

lru_add_drain();

@@ -4938,7 +4902,7 @@ static void lru_gen_shrink_lruvec(struct
if (!nr_to_scan)
goto done;

- delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
+ delta = evict_pages(lruvec, sc, swappiness);
if (!delta)
goto done;

@@ -4946,7 +4910,7 @@ static void lru_gen_shrink_lruvec(struct
if (scanned >= nr_to_scan)
break;

- if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
+ if (sc->nr_reclaimed >= nr_to_reclaim)
break;

cond_resched();
@@ -5393,7 +5357,7 @@ static int run_eviction(struct lruvec *l
if (sc->nr_reclaimed >= nr_to_reclaim)
return 0;

- if (!evict_pages(lruvec, sc, swappiness))
+ if (!evict_pages(lruvec, sc, swappiness))
return 0;

cond_resched();
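With the safeguard gone, the loop's stop condition reduces to a single target computed once per call. A small standalone model of get_nr_to_reclaim()'s three cases; the field names mirror scan_control but the struct is a stand-in, and returning -1 from an unsigned function yields ULONG_MAX, an effectively unreachable target for memcg reclaim:

#include <stdbool.h>
#include <stdio.h>

struct sc {
	bool global;			/* stand-in for global_reclaim(sc) */
	bool kswapd;			/* stand-in for current_is_kswapd() */
	unsigned long nr_to_reclaim;
	unsigned long last_reclaimed;
	unsigned long compact_gap;	/* stand-in for compact_gap(sc->order) */
};

static unsigned long get_nr_to_reclaim(const struct sc *sc)
{
	/* don't abort memcg reclaim to ensure fairness */
	if (!sc->global)
		return -1;	/* wraps to ULONG_MAX: never met */

	/* discount the previous progress for kswapd */
	if (sc->kswapd)
		return sc->nr_to_reclaim + sc->last_reclaimed;

	/* direct reclaim: at least enough room for compaction */
	return sc->nr_to_reclaim > sc->compact_gap ?
	       sc->nr_to_reclaim : sc->compact_gap;
}

int main(void)
{
	struct sc direct = { .global = true, .nr_to_reclaim = 32, .compact_gap = 512 };
	printf("direct reclaim target: %lu\n", get_nr_to_reclaim(&direct));
	return 0;
}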
@ -0,0 +1,287 @@
|
||||
From e20b7386fccc18c791796eb1dc1a91eee3ccf801 Mon Sep 17 00:00:00 2001
|
||||
From: Yu Zhao <yuzhao@google.com>
|
||||
Date: Wed, 21 Dec 2022 21:19:02 -0700
|
||||
Subject: [PATCH 24/29] mm: multi-gen LRU: remove aging fairness safeguard
|
||||
|
||||
Recall that the aging produces the youngest generation: first it scans
|
||||
for accessed pages and updates their gen counters; then it increments
|
||||
lrugen->max_seq.
|
||||
|
||||
The current aging fairness safeguard for kswapd uses two passes to
|
||||
ensure the fairness to multiple eligible memcgs. On the first pass,
|
||||
which is shared with the eviction, it checks whether all eligible
|
||||
memcgs are low on cold pages. If so, it requires a second pass, on
|
||||
which it ages all those memcgs at the same time.
|
||||
|
||||
With memcg LRU, the aging, while ensuring eventual fairness, will run
|
||||
when necessary. Therefore the current aging fairness safeguard for
|
||||
kswapd will not be needed.
|
||||
|
||||
Note that memcg LRU only applies to global reclaim. For memcg reclaim,
|
||||
the aging can be unfair to different memcgs, i.e., their
|
||||
lrugen->max_seq can be incremented at different paces.
|
||||
|
||||
Link: https://lkml.kernel.org/r/20221222041905.2431096-5-yuzhao@google.com
|
||||
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
||||
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
||||
Cc: Jonathan Corbet <corbet@lwn.net>
|
||||
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
||||
Cc: Michal Hocko <mhocko@kernel.org>
|
||||
Cc: Mike Rapoport <rppt@kernel.org>
|
||||
Cc: Roman Gushchin <roman.gushchin@linux.dev>
|
||||
Cc: Suren Baghdasaryan <surenb@google.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
mm/vmscan.c | 126 ++++++++++++++++++++++++----------------------------
|
||||
1 file changed, 59 insertions(+), 67 deletions(-)
|
||||
|
||||
--- a/mm/vmscan.c
|
||||
+++ b/mm/vmscan.c
|
||||
@@ -131,7 +131,6 @@ struct scan_control {
|
||||
|
||||
#ifdef CONFIG_LRU_GEN
|
||||
/* help kswapd make better choices among multiple memcgs */
|
||||
- unsigned int memcgs_need_aging:1;
|
||||
unsigned long last_reclaimed;
|
||||
#endif
|
||||
|
||||
@@ -4184,7 +4183,7 @@ done:
|
||||
return true;
|
||||
}
|
||||
|
||||
-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
|
||||
+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
|
||||
struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
|
||||
{
|
||||
int gen, type, zone;
|
||||
@@ -4193,6 +4192,13 @@ static bool should_run_aging(struct lruv
|
||||
unsigned long total = 0;
|
||||
struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
+ DEFINE_MIN_SEQ(lruvec);
|
||||
+
|
||||
+ /* whether this lruvec is completely out of cold pages */
|
||||
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
|
||||
+ *nr_to_scan = 0;
|
||||
+ return true;
|
||||
+ }
|
||||
|
||||
for (type = !can_swap; type < ANON_AND_FILE; type++) {
|
||||
unsigned long seq;
|
||||
@@ -4221,8 +4227,6 @@ static bool should_run_aging(struct lruv
|
||||
* stalls when the number of generations reaches MIN_NR_GENS. Hence, the
|
||||
* ideal number of generations is MIN_NR_GENS+1.
|
||||
*/
|
||||
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq)
|
||||
- return true;
|
||||
if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
|
||||
return false;
|
||||
|
||||
@@ -4241,40 +4245,54 @@ static bool should_run_aging(struct lruv
|
||||
return false;
|
||||
}
|
||||
|
||||
-static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned long min_ttl)
|
||||
+static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
|
||||
{
|
||||
- bool need_aging;
|
||||
- unsigned long nr_to_scan;
|
||||
- int swappiness = get_swappiness(lruvec, sc);
|
||||
+ int gen, type, zone;
|
||||
+ unsigned long total = 0;
|
||||
+ bool can_swap = get_swappiness(lruvec, sc);
|
||||
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
|
||||
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
DEFINE_MAX_SEQ(lruvec);
|
||||
DEFINE_MIN_SEQ(lruvec);
|
||||
|
||||
- VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
|
||||
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
|
||||
+ unsigned long seq;
|
||||
|
||||
- mem_cgroup_calculate_protection(NULL, memcg);
|
||||
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
|
||||
+ gen = lru_gen_from_seq(seq);
|
||||
|
||||
- if (mem_cgroup_below_min(memcg))
|
||||
- return false;
|
||||
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
|
||||
+ total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
|
||||
+ }
|
||||
+ }
|
||||
|
||||
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
|
||||
+ /* whether the size is big enough to be helpful */
|
||||
+ return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
|
||||
+}
|
||||
|
||||
- if (min_ttl) {
|
||||
- int gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
|
||||
- unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
|
||||
+static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
|
||||
+ unsigned long min_ttl)
|
||||
+{
|
||||
+ int gen;
|
||||
+ unsigned long birth;
|
||||
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
||||
+ DEFINE_MIN_SEQ(lruvec);
|
||||
|
||||
- if (time_is_after_jiffies(birth + min_ttl))
|
||||
- return false;
|
||||
+ VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
|
||||
|
||||
- /* the size is likely too small to be helpful */
|
||||
- if (!nr_to_scan && sc->priority != DEF_PRIORITY)
|
||||
- return false;
|
||||
- }
|
||||
+ /* see the comment on lru_gen_page */
|
||||
+ gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
|
||||
+ birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
|
||||
|
||||
- if (need_aging)
|
||||
- try_to_inc_max_seq(lruvec, max_seq, sc, swappiness, false);
|
||||
+ if (time_is_after_jiffies(birth + min_ttl))
|
||||
+ return false;
|
||||
|
||||
- return true;
|
||||
+ if (!lruvec_is_sizable(lruvec, sc))
|
||||
+ return false;
|
||||
+
|
||||
+ mem_cgroup_calculate_protection(NULL, memcg);
|
||||
+
|
||||
+ return !mem_cgroup_below_min(memcg);
|
||||
}
|
||||
|
||||
/* to protect the working set of the last N jiffies */
|
||||
@@ -4283,46 +4301,32 @@ static unsigned long lru_gen_min_ttl __r
|
||||
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
- bool success = false;
|
||||
unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
|
||||
|
||||
VM_WARN_ON_ONCE(!current_is_kswapd());
|
||||
|
||||
sc->last_reclaimed = sc->nr_reclaimed;
|
||||
|
||||
- /*
|
||||
- * To reduce the chance of going into the aging path, which can be
- * costly, optimistically skip it if the flag below was cleared in the
- * eviction path. This improves the overall performance when multiple
- * memcgs are available.
- */
- if (!sc->memcgs_need_aging) {
- sc->memcgs_need_aging = true;
+ /* check the order to exclude compaction-induced reclaim */
+ if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
- }
-
- set_mm_walk(pgdat);

memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);

- if (age_lruvec(lruvec, sc, min_ttl))
- success = true;
+ if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
+ mem_cgroup_iter_break(NULL, memcg);
+ return;
+ }

cond_resched();
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));

- clear_mm_walk();
-
- /* check the order to exclude compaction-induced reclaim */
- if (success || !min_ttl || sc->order)
- return;
-
/*
* The main goal is to OOM kill if every generation from all memcgs is
* younger than min_ttl. However, another possibility is all memcgs are
- * either below min or empty.
+ * either too small or below min.
*/
if (mutex_trylock(&oom_lock)) {
struct oom_control oc = {
@@ -4830,33 +4834,27 @@ retry:
* reclaim.
*/
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap, bool *need_aging)
+ bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);
- DEFINE_MIN_SEQ(lruvec);

if (mem_cgroup_below_min(memcg) ||
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
return 0;

- *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
- if (!*need_aging)
+ if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
return nr_to_scan;

/* skip the aging path at the default priority */
if (sc->priority == DEF_PRIORITY)
- goto done;
+ return nr_to_scan;

- /* leave the work to lru_gen_age_node() */
- if (current_is_kswapd())
- return 0;
+ try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);

- if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false))
- return nr_to_scan;
-done:
- return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
+ /* skip this lruvec as it's low on cold pages */
+ return 0;
}

static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -4875,9 +4873,7 @@ static unsigned long get_nr_to_reclaim(s
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
struct blk_plug plug;
- bool need_aging = false;
unsigned long scanned = 0;
- unsigned long reclaimed = sc->nr_reclaimed;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

lru_add_drain();
@@ -4898,13 +4894,13 @@ static void lru_gen_shrink_lruvec(struct
else
swappiness = 0;

- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
if (!nr_to_scan)
- goto done;
+ break;

delta = evict_pages(lruvec, sc, swappiness);
if (!delta)
- goto done;
+ break;

scanned += delta;
if (scanned >= nr_to_scan)
@@ -4916,10 +4912,6 @@ static void lru_gen_shrink_lruvec(struct
cond_resched();
}

- /* see the comment in lru_gen_age_node() */
- if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
- sc->memcgs_need_aging = false;
-done:
clear_mm_walk();

blk_finish_plug(&plug);
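As an aside on the check the hunk above keeps relying on: a lruvec is considered low on cold pages once fewer than MIN_NR_GENS generations separate min_seq from max_seq. The following standalone C model illustrates the arithmetic; it is a sketch, not kernel code, and the sample numbers are made up.

#include <stdbool.h>
#include <stdio.h>

#define MIN_NR_GENS 2    /* matches the kernel constant */

/* model of the "low on cold pages" test behind the aging/eviction split */
static bool low_on_cold_pages(unsigned long min_seq, unsigned long max_seq)
{
    return min_seq + MIN_NR_GENS > max_seq;
}

int main(void)
{
    printf("%d\n", low_on_cold_pages(10, 13)); /* 0: a third generation is still evictable */
    printf("%d\n", low_on_cold_pages(12, 13)); /* 1: only the minimum two generations remain */
    return 0;
}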
@@ -0,0 +1,161 @@
From 107d54931df3c28d81648122e219bf0034ef4e99 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:03 -0700
Subject: [PATCH 25/29] mm: multi-gen LRU: shuffle should_run_aging()

Move should_run_aging() next to its only caller left.

Link: https://lkml.kernel.org/r/20221222041905.2431096-6-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 124 ++++++++++++++++++++++++++--------------------------
1 file changed, 62 insertions(+), 62 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4183,68 +4183,6 @@ done:
return true;
}

-static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
- struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
-{
- int gen, type, zone;
- unsigned long old = 0;
- unsigned long young = 0;
- unsigned long total = 0;
- struct lru_gen_page *lrugen = &lruvec->lrugen;
- struct mem_cgroup *memcg = lruvec_memcg(lruvec);
- DEFINE_MIN_SEQ(lruvec);
-
- /* whether this lruvec is completely out of cold pages */
- if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
- *nr_to_scan = 0;
- return true;
- }
-
- for (type = !can_swap; type < ANON_AND_FILE; type++) {
- unsigned long seq;
-
- for (seq = min_seq[type]; seq <= max_seq; seq++) {
- unsigned long size = 0;
-
- gen = lru_gen_from_seq(seq);
-
- for (zone = 0; zone < MAX_NR_ZONES; zone++)
- size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
-
- total += size;
- if (seq == max_seq)
- young += size;
- else if (seq + MIN_NR_GENS == max_seq)
- old += size;
- }
- }
-
- /* try to scrape all its memory if this memcg was deleted */
- *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
-
- /*
- * The aging tries to be lazy to reduce the overhead, while the eviction
- * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
- * ideal number of generations is MIN_NR_GENS+1.
- */
- if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
- return false;
-
- /*
- * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
- * of the total number of pages for each generation. A reasonable range
- * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
- * aging cares about the upper bound of hot pages, while the eviction
- * cares about the lower bound of cold pages.
- */
- if (young * MIN_NR_GENS > total)
- return true;
- if (old * (MIN_NR_GENS + 2) < total)
- return true;
-
- return false;
-}
-
static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
{
int gen, type, zone;
@@ -4828,6 +4766,68 @@ retry:
return scanned;
}

+static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
+ struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
+{
+ int gen, type, zone;
+ unsigned long old = 0;
+ unsigned long young = 0;
+ unsigned long total = 0;
+ struct lru_gen_page *lrugen = &lruvec->lrugen;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ /* whether this lruvec is completely out of cold pages */
+ if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
+ *nr_to_scan = 0;
+ return true;
+ }
+
+ for (type = !can_swap; type < ANON_AND_FILE; type++) {
+ unsigned long seq;
+
+ for (seq = min_seq[type]; seq <= max_seq; seq++) {
+ unsigned long size = 0;
+
+ gen = lru_gen_from_seq(seq);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
+
+ total += size;
+ if (seq == max_seq)
+ young += size;
+ else if (seq + MIN_NR_GENS == max_seq)
+ old += size;
+ }
+ }
+
+ /* try to scrape all its memory if this memcg was deleted */
+ *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
+
+ /*
+ * The aging tries to be lazy to reduce the overhead, while the eviction
+ * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
+ * ideal number of generations is MIN_NR_GENS+1.
+ */
+ if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
+ return false;
+
+ /*
+ * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
+ * of the total number of pages for each generation. A reasonable range
+ * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
+ * aging cares about the upper bound of hot pages, while the eviction
+ * cares about the lower bound of cold pages.
+ */
+ if (young * MIN_NR_GENS > total)
+ return true;
+ if (old * (MIN_NR_GENS + 2) < total)
+ return true;
+
+ return false;
+}
+
/*
* For future optimizations:
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
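The ratio checks at the end of should_run_aging() above read as: run the aging when the youngest generation holds more than 1/MIN_NR_GENS of the pages, or when the oldest holds less than 1/(MIN_NR_GENS + 2). The standalone C model below demonstrates the arithmetic; it is a sketch, not kernel code, and the sample page counts are illustrative.

#include <stdbool.h>
#include <stdio.h>

#define MIN_NR_GENS 2    /* matches the kernel constant */

static bool aging_wanted(unsigned long young, unsigned long old,
                         unsigned long total)
{
    /* too many hot pages: the upper bound the aging cares about */
    if (young * MIN_NR_GENS > total)
        return true;
    /* too few cold pages: the lower bound the eviction cares about */
    if (old * (MIN_NR_GENS + 2) < total)
        return true;
    return false;
}

int main(void)
{
    printf("%d\n", aging_wanted(60, 30, 100)); /* 1: young > 1/2 of total */
    printf("%d\n", aging_wanted(30, 30, 100)); /* 0: balanced, no aging */
    printf("%d\n", aging_wanted(30, 10, 100)); /* 1: old < 1/4 of total */
    return 0;
}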
@@ -0,0 +1,868 @@
From fa6363828d314e837c5f79e97ea5e8c0d2f7f062 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:04 -0700
Subject: [PATCH 26/29] mm: multi-gen LRU: per-node lru_gen_page lists

For each node, memcgs are divided into two generations: the old and
the young. For each generation, memcgs are randomly sharded into
multiple bins to improve scalability. For each bin, an RCU hlist_nulls
is virtually divided into three segments: the head, the tail and the
default.

An onlining memcg is added to the tail of a random bin in the old
generation. The eviction starts at the head of a random bin in the old
generation. The per-node memcg generation counter, whose remainder (mod
2) indexes the old generation, is incremented when all its bins become
empty.

There are four operations:
1. MEMCG_LRU_HEAD, which moves a memcg to the head of a random bin in
its current generation (old or young) and updates its "seg" to
"head";
2. MEMCG_LRU_TAIL, which moves a memcg to the tail of a random bin in
its current generation (old or young) and updates its "seg" to
"tail";
3. MEMCG_LRU_OLD, which moves a memcg to the head of a random bin in
the old generation, updates its "gen" to "old" and resets its "seg"
to "default";
4. MEMCG_LRU_YOUNG, which moves a memcg to the tail of a random bin
in the young generation, updates its "gen" to "young" and resets
its "seg" to "default".

The events that trigger the above operations are:
1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
2. The first attempt to reclaim a memcg below low, which triggers
MEMCG_LRU_TAIL;
3. The first attempt to reclaim a memcg below the reclaimable size
threshold, which triggers MEMCG_LRU_TAIL;
4. The second attempt to reclaim a memcg below the reclaimable size
threshold, which triggers MEMCG_LRU_YOUNG;
5. Attempting to reclaim a memcg below min, which triggers
MEMCG_LRU_YOUNG;
6. Finishing the aging on the eviction path, which triggers
MEMCG_LRU_YOUNG;
7. Offlining a memcg, which triggers MEMCG_LRU_OLD.

Note that the memcg LRU only applies to global reclaim, and the
round-robin incrementing of their max_seq counters ensures the
eventual fairness to all eligible memcgs. For memcg reclaim, it still
relies on mem_cgroup_iter().

Link: https://lkml.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
include/linux/memcontrol.h | 10 +
include/linux/mm_inline.h | 17 ++
include/linux/mmzone.h | 117 +++++++++++-
mm/memcontrol.c | 16 ++
mm/page_alloc.c | 1 +
mm/vmscan.c | 373 +++++++++++++++++++++++++++++++++----
6 files changed, 499 insertions(+), 35 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -818,6 +818,11 @@ static inline void obj_cgroup_put(struct
percpu_ref_put(&objcg->refcnt);
}

+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return !memcg || css_tryget(&memcg->css);
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
if (memcg)
@@ -1283,6 +1288,11 @@ struct mem_cgroup *mem_cgroup_from_css(s
return NULL;
}

+static inline bool mem_cgroup_tryget(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
static inline void mem_cgroup_put(struct mem_cgroup *memcg)
{
}
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -112,6 +112,18 @@ static inline bool lru_gen_in_fault(void
return current->in_lru_fault;
}

+#ifdef CONFIG_MEMCG
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return READ_ONCE(lruvec->lrugen.seg);
+}
+#else
+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+#endif
+
static inline int lru_gen_from_seq(unsigned long seq)
{
return seq % MAX_NR_GENS;
@@ -287,6 +299,11 @@ static inline bool lru_gen_in_fault(void
return false;
}

+static inline int lru_gen_memcg_seg(struct lruvec *lruvec)
+{
+ return 0;
+}
+
static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
{
return false;
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -7,6 +7,7 @@

#include <linux/spinlock.h>
#include <linux/list.h>
+#include <linux/list_nulls.h>
#include <linux/wait.h>
#include <linux/bitops.h>
#include <linux/cache.h>
@@ -357,6 +358,15 @@ struct page_vma_mapped_walk;
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)

+/* see the comment on MEMCG_NR_GENS */
+enum {
+ MEMCG_LRU_NOP,
+ MEMCG_LRU_HEAD,
+ MEMCG_LRU_TAIL,
+ MEMCG_LRU_OLD,
+ MEMCG_LRU_YOUNG,
+};
+
#ifdef CONFIG_LRU_GEN

enum {
@@ -416,6 +426,14 @@ struct lru_gen_page {
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
/* whether the multi-gen LRU is enabled */
bool enabled;
+#ifdef CONFIG_MEMCG
+ /* the memcg generation this lru_gen_page belongs to */
+ u8 gen;
+ /* the list segment this lru_gen_page belongs to */
+ u8 seg;
+ /* per-node lru_gen_page list for global reclaim */
+ struct hlist_nulls_node list;
+#endif
};

enum {
@@ -469,12 +487,87 @@ void lru_gen_init_lruvec(struct lruvec *
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);

#ifdef CONFIG_MEMCG
+
+/*
+ * For each node, memcgs are divided into two generations: the old and the
+ * young. For each generation, memcgs are randomly sharded into multiple bins
+ * to improve scalability. For each bin, the hlist_nulls is virtually divided
+ * into three segments: the head, the tail and the default.
+ *
+ * An onlining memcg is added to the tail of a random bin in the old generation.
+ * The eviction starts at the head of a random bin in the old generation. The
+ * per-node memcg generation counter, whose reminder (mod MEMCG_NR_GENS) indexes
+ * the old generation, is incremented when all its bins become empty.
+ *
+ * There are four operations:
+ * 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its
+ * current generation (old or young) and updates its "seg" to "head";
+ * 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its
+ * current generation (old or young) and updates its "seg" to "tail";
+ * 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old
+ * generation, updates its "gen" to "old" and resets its "seg" to "default";
+ * 4. MEMCG_LRU_YOUNG, which moves an memcg to the tail of a random bin in the
+ * young generation, updates its "gen" to "young" and resets its "seg" to
+ * "default".
+ *
+ * The events that trigger the above operations are:
+ * 1. Exceeding the soft limit, which triggers MEMCG_LRU_HEAD;
+ * 2. The first attempt to reclaim an memcg below low, which triggers
+ * MEMCG_LRU_TAIL;
+ * 3. The first attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_TAIL;
+ * 4. The second attempt to reclaim an memcg below reclaimable size threshold,
+ * which triggers MEMCG_LRU_YOUNG;
+ * 5. Attempting to reclaim an memcg below min, which triggers MEMCG_LRU_YOUNG;
+ * 6. Finishing the aging on the eviction path, which triggers MEMCG_LRU_YOUNG;
+ * 7. Offlining an memcg, which triggers MEMCG_LRU_OLD.
+ *
+ * Note that memcg LRU only applies to global reclaim, and the round-robin
+ * incrementing of their max_seq counters ensures the eventual fairness to all
+ * eligible memcgs. For memcg reclaim, it still relies on mem_cgroup_iter().
+ */
+#define MEMCG_NR_GENS 2
+#define MEMCG_NR_BINS 8
+
+struct lru_gen_memcg {
+ /* the per-node memcg generation counter */
+ unsigned long seq;
+ /* each memcg has one lru_gen_page per node */
+ unsigned long nr_memcgs[MEMCG_NR_GENS];
+ /* per-node lru_gen_page list for global reclaim */
+ struct hlist_nulls_head fifo[MEMCG_NR_GENS][MEMCG_NR_BINS];
+ /* protects the above */
+ spinlock_t lock;
+};
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat);
+
void lru_gen_init_memcg(struct mem_cgroup *memcg);
void lru_gen_exit_memcg(struct mem_cgroup *memcg);
-#endif
+void lru_gen_online_memcg(struct mem_cgroup *memcg);
+void lru_gen_offline_memcg(struct mem_cgroup *memcg);
+void lru_gen_release_memcg(struct mem_cgroup *memcg);
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op);
+
+#else /* !CONFIG_MEMCG */
+
+#define MEMCG_NR_GENS 1
+
+struct lru_gen_memcg {
+};
+
+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
+#endif /* CONFIG_MEMCG */

#else /* !CONFIG_LRU_GEN */

+static inline void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+}
+
static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
{
}
@@ -484,6 +577,7 @@ static inline void lru_gen_look_around(s
}

#ifdef CONFIG_MEMCG
+
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
@@ -491,7 +585,24 @@ static inline void lru_gen_init_memcg(st
static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
{
}
-#endif
+
+static inline void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+}
+
+static inline void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+}
+
+#endif /* CONFIG_MEMCG */

#endif /* CONFIG_LRU_GEN */

@@ -1105,6 +1216,8 @@ typedef struct pglist_data {
#ifdef CONFIG_LRU_GEN
/* kswap mm walk data */
struct lru_gen_mm_walk mm_walk;
+ /* lru_gen_page list */
+ struct lru_gen_memcg memcg_lru;
#endif

ZONE_PADDING(_pad2_)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -549,6 +549,16 @@ static void mem_cgroup_update_tree(struc
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;

+ if (lru_gen_enabled()) {
+ struct lruvec *lruvec = &mem_cgroup_page_nodeinfo(memcg, page)->lruvec;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (soft_limit_excess(memcg) && lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
+
+ return;
+ }
+
mctz = soft_limit_tree_from_page(page);
if (!mctz)
return;
@@ -3433,6 +3443,9 @@ unsigned long mem_cgroup_soft_limit_recl
unsigned long excess;
unsigned long nr_scanned;

+ if (lru_gen_enabled())
+ return 0;
+
if (order > 0)
return 0;

@@ -5321,6 +5334,7 @@ static int mem_cgroup_css_online(struct
if (unlikely(mem_cgroup_is_root(memcg)))
queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
2UL*HZ);
+ lru_gen_online_memcg(memcg);
return 0;
}

@@ -5347,6 +5361,7 @@ static void mem_cgroup_css_offline(struc
memcg_offline_kmem(memcg);
reparent_shrinker_deferred(memcg);
wb_memcg_offline(memcg);
+ lru_gen_offline_memcg(memcg);

drain_all_stock(memcg);

@@ -5358,6 +5373,7 @@ static void mem_cgroup_css_released(stru
struct mem_cgroup *memcg = mem_cgroup_from_css(css);

invalidate_reclaim_iterators(memcg);
+ lru_gen_release_memcg(memcg);
}

static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7661,6 +7661,7 @@ static void __init free_area_init_node(i
pgdat_set_deferred_range(pgdat);

free_area_init_core(pgdat);
+ lru_gen_init_pgdat(pgdat);
}

void __init free_area_init_memoryless_node(int nid)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,8 @@
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
+#include <linux/rculist_nulls.h>
+#include <linux/random.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -129,11 +131,6 @@ struct scan_control {
/* Always discard instead of demoting to lower tier memory */
unsigned int no_demotion:1;

-#ifdef CONFIG_LRU_GEN
- /* help kswapd make better choices among multiple memcgs */
- unsigned long last_reclaimed;
-#endif
-
/* Allocation order */
s8 order;

@@ -2880,6 +2877,9 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_ca
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)

+#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
+#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
+
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -4169,8 +4169,7 @@ done:
if (sc->priority <= DEF_PRIORITY - 2)
wait_event_killable(lruvec->mm_state.wait,
max_seq < READ_ONCE(lrugen->max_seq));
-
- return max_seq < READ_ONCE(lrugen->max_seq);
+ return false;
}

VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
@@ -4243,8 +4242,6 @@ static void lru_gen_age_node(struct pgli

VM_WARN_ON_ONCE(!current_is_kswapd());

- sc->last_reclaimed = sc->nr_reclaimed;
-
/* check the order to exclude compaction-induced reclaim */
if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
return;
@@ -4833,8 +4830,7 @@ static bool should_run_aging(struct lruv
* 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
* reclaim.
*/
-static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
- bool can_swap)
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
{
unsigned long nr_to_scan;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
@@ -4851,10 +4847,8 @@ static unsigned long get_nr_to_scan(stru
if (sc->priority == DEF_PRIORITY)
return nr_to_scan;

- try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false);
-
/* skip this lruvec as it's low on cold pages */
- return 0;
+ return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
}

static unsigned long get_nr_to_reclaim(struct scan_control *sc)
@@ -4863,29 +4857,18 @@ static unsigned long get_nr_to_reclaim(s
if (!global_reclaim(sc))
return -1;

- /* discount the previous progress for kswapd */
- if (current_is_kswapd())
- return sc->nr_to_reclaim + sc->last_reclaimed;
-
return max(sc->nr_to_reclaim, compact_gap(sc->order));
}

-static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
- struct blk_plug plug;
+ long nr_to_scan;
unsigned long scanned = 0;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

- lru_add_drain();
-
- blk_start_plug(&plug);
-
- set_mm_walk(lruvec_pgdat(lruvec));
-
while (true) {
int delta;
int swappiness;
- unsigned long nr_to_scan;

if (sc->may_swap)
swappiness = get_swappiness(lruvec, sc);
@@ -4895,7 +4878,7 @@ static void lru_gen_shrink_lruvec(struct
swappiness = 0;

nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
- if (!nr_to_scan)
+ if (nr_to_scan <= 0)
break;

delta = evict_pages(lruvec, sc, swappiness);
@@ -4912,10 +4895,250 @@ static void lru_gen_shrink_lruvec(struct
cond_resched();
}

+ /* whether try_to_inc_max_seq() was successful */
+ return nr_to_scan < 0;
+}
+
+static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
+{
+ bool success;
+ unsigned long scanned = sc->nr_scanned;
+ unsigned long reclaimed = sc->nr_reclaimed;
+ int seg = lru_gen_memcg_seg(lruvec);
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (!lruvec_is_sizable(lruvec, sc))
+ return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
+
+ mem_cgroup_calculate_protection(NULL, memcg);
+
+ if (mem_cgroup_below_min(memcg))
+ return MEMCG_LRU_YOUNG;
+
+ if (mem_cgroup_below_low(memcg)) {
+ /* see the comment on MEMCG_NR_GENS */
+ if (seg != MEMCG_LRU_TAIL)
+ return MEMCG_LRU_TAIL;
+
+ memcg_memory_event(memcg, MEMCG_LOW);
+ }
+
+ success = try_to_shrink_lruvec(lruvec, sc);
+
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
+
+ vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
+
+ sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
+ current->reclaim_state->reclaimed_slab = 0;
+
+ return success ? MEMCG_LRU_YOUNG : 0;
+}
+
+#ifdef CONFIG_MEMCG
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int gen;
+ int bin;
+ int first_bin;
+ struct lruvec *lruvec;
+ struct lru_gen_page *lrugen;
+ const struct hlist_nulls_node *pos;
+ int op = 0;
+ struct mem_cgroup *memcg = NULL;
+ unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+
+ bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
+restart:
+ gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
+
+ rcu_read_lock();
+
+ hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ lruvec = container_of(lrugen, struct lruvec, lrugen);
+ memcg = lruvec_memcg(lruvec);
+
+ if (!mem_cgroup_tryget(memcg)) {
+ op = 0;
+ memcg = NULL;
+ continue;
+ }
+
+ rcu_read_unlock();
+
+ op = shrink_one(lruvec, sc);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ goto success;
+
+ rcu_read_lock();
+ }
+
+ rcu_read_unlock();
+
+ /* restart if raced with lru_gen_rotate_memcg() */
+ if (gen != get_nulls_value(pos))
+ goto restart;
+
+ /* try the rest of the bins of the current generation */
+ bin = get_memcg_bin(bin + 1);
+ if (bin != first_bin)
+ goto restart;
+success:
+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ struct blk_plug plug;
+
+ VM_WARN_ON_ONCE(global_reclaim(sc));
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(lruvec_pgdat(lruvec));
+
+ if (try_to_shrink_lruvec(lruvec, sc))
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
+
+ clear_mm_walk();
+
+ blk_finish_plug(&plug);
+}
+
+#else /* !CONFIG_MEMCG */
+
+static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+ BUILD_BUG();
+}
+
+#endif
+
+static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ int priority;
+ unsigned long reclaimable;
+ struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+ if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
+ return;
+ /*
+ * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
+ * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
+ * estimated reclaimed_to_scanned_ratio = inactive / total.
+ */
+ reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_swappiness(lruvec, sc))
+ reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ reclaimable /= MEMCG_NR_GENS;
+
+ /* round down reclaimable and round up sc->nr_to_reclaim */
+ priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
+
+ sc->priority = clamp(priority, 0, DEF_PRIORITY);
+}
+
+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+ struct blk_plug plug;
+ unsigned long reclaimed = sc->nr_reclaimed;
+
+ VM_WARN_ON_ONCE(!global_reclaim(sc));
+
+ lru_add_drain();
+
+ blk_start_plug(&plug);
+
+ set_mm_walk(pgdat);
+
+ set_initial_priority(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed = 0;
+
+ if (mem_cgroup_disabled())
+ shrink_one(&pgdat->__lruvec, sc);
+ else
+ shrink_many(pgdat, sc);
+
+ if (current_is_kswapd())
+ sc->nr_reclaimed += reclaimed;
+
clear_mm_walk();

blk_finish_plug(&plug);
+
+ /* kswapd should never fail */
+ pgdat->kswapd_failures = 0;
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
+{
+ int seg;
+ int old, new;
+ int bin = prandom_u32_max(MEMCG_NR_BINS);
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ seg = 0;
+ new = old = lruvec->lrugen.gen;
+
+ /* see the comment on MEMCG_NR_GENS */
+ if (op == MEMCG_LRU_HEAD)
+ seg = MEMCG_LRU_HEAD;
+ else if (op == MEMCG_LRU_TAIL)
+ seg = MEMCG_LRU_TAIL;
+ else if (op == MEMCG_LRU_OLD)
+ new = get_memcg_gen(pgdat->memcg_lru.seq);
+ else if (op == MEMCG_LRU_YOUNG)
+ new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
+ else
+ VM_WARN_ON_ONCE(true);
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+
+ if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
+ hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+ else
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
+
+ pgdat->memcg_lru.nr_memcgs[old]--;
+ pgdat->memcg_lru.nr_memcgs[new]++;
+
+ lruvec->lrugen.gen = new;
+ WRITE_ONCE(lruvec->lrugen.seg, seg);
+
+ if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
}
+#endif

/******************************************************************************
* state change
@@ -5370,11 +5593,11 @@ static int run_cmd(char cmd, int memcg_i

if (!mem_cgroup_disabled()) {
rcu_read_lock();
+
memcg = mem_cgroup_from_id(memcg_id);
-#ifdef CONFIG_MEMCG
- if (memcg && !css_tryget(&memcg->css))
+ if (!mem_cgroup_tryget(memcg))
memcg = NULL;
-#endif
+
rcu_read_unlock();

if (!memcg)
@@ -5521,6 +5744,19 @@ void lru_gen_init_lruvec(struct lruvec *
}

#ifdef CONFIG_MEMCG
+
+void lru_gen_init_pgdat(struct pglist_data *pgdat)
+{
+ int i, j;
+
+ spin_lock_init(&pgdat->memcg_lru.lock);
+
+ for (i = 0; i < MEMCG_NR_GENS; i++) {
+ for (j = 0; j < MEMCG_NR_BINS; j++)
+ INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
+ }
+}
+
void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
INIT_LIST_HEAD(&memcg->mm_list.fifo);
@@ -5544,7 +5780,69 @@ void lru_gen_exit_memcg(struct mem_cgrou
}
}
}
-#endif
+
+void lru_gen_online_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+ int bin = prandom_u32_max(MEMCG_NR_BINS);
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = get_memcg_gen(pgdat->memcg_lru.seq);
+
+ hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
+ pgdat->memcg_lru.nr_memcgs[gen]++;
+
+ lruvec->lrugen.gen = gen;
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+void lru_gen_offline_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
+ }
+}
+
+void lru_gen_release_memcg(struct mem_cgroup *memcg)
+{
+ int gen;
+ int nid;
+
+ for_each_node(nid) {
+ struct pglist_data *pgdat = NODE_DATA(nid);
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ spin_lock(&pgdat->memcg_lru.lock);
+
+ VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+
+ gen = lruvec->lrugen.gen;
+
+ hlist_nulls_del_rcu(&lruvec->lrugen.list);
+ pgdat->memcg_lru.nr_memcgs[gen]--;
+
+ if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
+ WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
+
+ spin_unlock(&pgdat->memcg_lru.lock);
+ }
+}
+
+#endif /* CONFIG_MEMCG */

static int __init init_lru_gen(void)
{
@@ -5571,6 +5869,10 @@ static void lru_gen_shrink_lruvec(struct
{
}

+static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
+{
+}
+
#endif /* CONFIG_LRU_GEN */

static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -5584,7 +5886,7 @@ static void shrink_lruvec(struct lruvec
bool proportional_reclaim;
struct blk_plug plug;

- if (lru_gen_enabled()) {
+ if (lru_gen_enabled() && !global_reclaim(sc)) {
lru_gen_shrink_lruvec(lruvec, sc);
return;
}
@@ -5826,6 +6128,11 @@ static void shrink_node(pg_data_t *pgdat
struct lruvec *target_lruvec;
bool reclaimable = false;

+ if (lru_gen_enabled() && global_reclaim(sc)) {
+ lru_gen_shrink_node(pgdat, sc);
+ return;
+ }
+
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
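To make the generation and bin arithmetic in the patch above concrete, here is a standalone C model; it is not kernel code, and rand() merely stands in for the kernel's prandom_u32_max().

#include <stdio.h>
#include <stdlib.h>

#define MEMCG_NR_GENS 2
#define MEMCG_NR_BINS 8

#define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
#define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)

int main(void)
{
    unsigned long seq = 0; /* per-node memcg generation counter */

    /* the old generation is seq mod 2; the young one is (seq + 1) mod 2 */
    printf("old gen: %lu, young gen: %lu\n",
           get_memcg_gen(seq), get_memcg_gen(seq + 1));

    /* an onlining memcg lands at the tail of a random bin of the old generation */
    int bin = rand() % MEMCG_NR_BINS;
    printf("onlining memcg -> fifo[%lu][%d] (tail)\n", get_memcg_gen(seq), bin);

    /* once all bins of the old generation drain, the counter advances
       and the roles of the two generations swap */
    seq++;
    printf("after increment, old gen: %lu\n", get_memcg_gen(seq));
    return 0;
}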
@@ -0,0 +1,196 @@
From 93147736b5b3a21bea24313bfc7a696829932009 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:05 -0700
Subject: [PATCH 27/29] mm: multi-gen LRU: clarify scan_control flags

Among the flags in scan_control:
1. sc->may_swap, which indicates swap constraint due to memsw.max, is
supported as usual.
2. sc->proactive, which indicates reclaim by memory.reclaim, may not
opportunistically skip the aging path, since it is considered less
latency sensitive.
3. !(sc->gfp_mask & __GFP_IO), which indicates IO constraint, lowers
swappiness to prioritize file LRU, since clean file pages are more
likely to exist.
4. sc->may_writepage and sc->may_unmap, which indicate opportunistic
reclaim, are rejected, since unmapped clean pages are already
prioritized. Scanning for more of them is likely futile and can
cause high reclaim latency when there is a large number of memcgs.

The rest are handled by the existing code.

Link: https://lkml.kernel.org/r/20221222041905.2431096-8-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 55 +++++++++++++++++++++++++++--------------------------
1 file changed, 28 insertions(+), 27 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2905,6 +2905,9 @@ static int get_swappiness(struct lruvec
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);

+ if (!sc->may_swap)
+ return 0;
+
if (!can_demote(pgdat->node_id, sc) &&
mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH)
return 0;
@@ -3952,7 +3955,7 @@ static void walk_mm(struct lruvec *lruve
} while (err == -EAGAIN);
}

-static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
{
struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;

@@ -3960,7 +3963,7 @@ static struct lru_gen_mm_walk *set_mm_wa
VM_WARN_ON_ONCE(walk);

walk = &pgdat->mm_walk;
- } else if (!pgdat && !walk) {
+ } else if (!walk && force_alloc) {
VM_WARN_ON_ONCE(current_is_kswapd());

walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
@@ -4146,7 +4149,7 @@ static bool try_to_inc_max_seq(struct lr
goto done;
}

- walk = set_mm_walk(NULL);
+ walk = set_mm_walk(NULL, true);
if (!walk) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
@@ -4215,8 +4218,6 @@ static bool lruvec_is_reclaimable(struct
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MIN_SEQ(lruvec);

- VM_WARN_ON_ONCE(sc->memcg_low_reclaim);
-
/* see the comment on lru_gen_page */
gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
@@ -4472,12 +4473,8 @@ static bool isolate_page(struct lruvec *
{
bool success;

- /* unmapping inhibited */
- if (!sc->may_unmap && page_mapped(page))
- return false;
-
/* swapping inhibited */
- if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
+ if (!(sc->gfp_mask & __GFP_IO) &&
(PageDirty(page) ||
(PageAnon(page) && !PageSwapCache(page))))
return false;
@@ -4574,9 +4571,8 @@ static int scan_pages(struct lruvec *lru
__count_vm_events(PGSCAN_ANON + type, isolated);

/*
- * There might not be eligible pages due to reclaim_idx, may_unmap and
- * may_writepage. Check the remaining to prevent livelock if it's not
- * making progress.
+ * There might not be eligible pages due to reclaim_idx. Check the
+ * remaining to prevent livelock if it's not making progress.
*/
return isolated || !remaining ? scanned : 0;
}
@@ -4836,8 -4832,7 @@ static long get_nr_to_scan(struct lruvec
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
DEFINE_MAX_SEQ(lruvec);

- if (mem_cgroup_below_min(memcg) ||
- (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ if (mem_cgroup_below_min(memcg))
return 0;

if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
@@ -4865,17 +4860,14 @@ static bool try_to_shrink_lruvec(struct
long nr_to_scan;
unsigned long scanned = 0;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
+ int swappiness = get_swappiness(lruvec, sc);
+
+ /* clean file pages are more likely to exist */
+ if (swappiness && !(sc->gfp_mask & __GFP_IO))
+ swappiness = 1;

while (true) {
int delta;
- int swappiness;
-
- if (sc->may_swap)
- swappiness = get_swappiness(lruvec, sc);
- else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc))
- swappiness = 1;
- else
- swappiness = 0;

nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
if (nr_to_scan <= 0)
@@ -5005,12 +4997,13 @@ static void lru_gen_shrink_lruvec(struct
struct blk_plug plug;

VM_WARN_ON_ONCE(global_reclaim(sc));
+ VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);

lru_add_drain();

blk_start_plug(&plug);

- set_mm_walk(lruvec_pgdat(lruvec));
+ set_mm_walk(NULL, false);

if (try_to_shrink_lruvec(lruvec, sc))
lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
@@ -5066,11 +5059,19 @@ static void lru_gen_shrink_node(struct p

VM_WARN_ON_ONCE(!global_reclaim(sc));

+ /*
+ * Unmapped clean pages are already prioritized. Scanning for more of
+ * them is likely futile and can cause high reclaim latency when there
+ * is a large number of memcgs.
+ */
+ if (!sc->may_writepage || !sc->may_unmap)
+ goto done;
+
lru_add_drain();

blk_start_plug(&plug);

- set_mm_walk(pgdat);
+ set_mm_walk(pgdat, false);

set_initial_priority(pgdat, sc);

@@ -5088,7 +5089,7 @@ static void lru_gen_shrink_node(struct p
clear_mm_walk();

blk_finish_plug(&plug);
-
+done:
/* kswapd should never fail */
pgdat->kswapd_failures = 0;
}
@@ -5656,7 +5657,7 @@ static ssize_t lru_gen_seq_write(struct
set_task_reclaim_state(current, &sc.reclaim_state);
flags = memalloc_noreclaim_save();
blk_start_plug(&plug);
- if (!set_mm_walk(NULL)) {
+ if (!set_mm_walk(NULL, true)) {
err = -ENOMEM;
goto done;
}
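The swappiness handling this patch consolidates can be modeled in a few lines of standalone C; this is a sketch, not kernel code, and the __GFP_IO value used here is illustrative rather than the kernel's definition.

#include <stdio.h>

#define __GFP_IO 0x40u /* illustrative flag value, not the kernel's */

static int effective_swappiness(int swappiness, int may_swap,
                                unsigned int gfp_mask)
{
    /* may_swap gates swapping entirely */
    if (!may_swap)
        return 0;
    /* under IO constraint, clean file pages are more likely to exist */
    if (swappiness && !(gfp_mask & __GFP_IO))
        return 1;
    return swappiness;
}

int main(void)
{
    printf("%d\n", effective_swappiness(60, 1, __GFP_IO)); /* 60 */
    printf("%d\n", effective_swappiness(60, 1, 0));        /* 1 */
    printf("%d\n", effective_swappiness(60, 0, __GFP_IO)); /* 0 */
    return 0;
}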
@@ -0,0 +1,34 @@
From cf3297e4c7a928da8b2b2f0baff2f9c69ea57952 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 21 Dec 2022 21:19:06 -0700
Subject: [PATCH 28/29] mm: multi-gen LRU: simplify arch_has_hw_pte_young()
check

Scanning page tables when hardware does not set the accessed bit has
no real use cases.

Link: https://lkml.kernel.org/r/20221222041905.2431096-9-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michael Larabel <Michael@MichaelLarabel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4144,7 +4144,7 @@ static bool try_to_inc_max_seq(struct lr
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!force_scan && !(arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))) {
+ if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -0,0 +1,88 @@
From cc67f962cc53f6e1dfa92eb85b7b26fe83a3c66f Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 13 Feb 2023 00:53:22 -0700
Subject: [PATCH 29/29] mm: multi-gen LRU: avoid futile retries

Recall that the per-node memcg LRU has two generations and they alternate
when the last memcg (of a given node) is moved from one to the other.
Each generation is also sharded into multiple bins to improve scalability.
A reclaimer starts with a random bin (in the old generation) and, if it
fails, it will retry, i.e., try the rest of the bins.

If a reclaimer fails with the last memcg, it should move this memcg to the
young generation first, which causes the generations to alternate, and
then retry. Otherwise, the retries will be futile because all other bins
are empty.

Link: https://lkml.kernel.org/r/20230213075322.1416966-1-yuzhao@google.com
Fixes: e4dde56cd208 ("mm: multi-gen LRU: per-node lru_gen_folio lists")
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: T.J. Mercier <tjmercier@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/vmscan.c | 25 +++++++++++++++----------
1 file changed, 15 insertions(+), 10 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4934,18 +4934,20 @@ static int shrink_one(struct lru

static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
{
+ int op;
int gen;
int bin;
int first_bin;
struct lruvec *lruvec;
struct lru_gen_page *lrugen;
+ struct mem_cgroup *memcg;
const struct hlist_nulls_node *pos;
- int op = 0;
- struct mem_cgroup *memcg = NULL;
unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);

bin = first_bin = prandom_u32_max(MEMCG_NR_BINS);
restart:
+ op = 0;
+ memcg = NULL;
gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));

rcu_read_lock();
@@ -4969,14 +4971,22 @@ restart:

op = shrink_one(lruvec, sc);

- if (sc->nr_reclaimed >= nr_to_reclaim)
- goto success;
-
rcu_read_lock();
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ break;
}

rcu_read_unlock();

+ if (op)
+ lru_gen_rotate_memcg(lruvec, op);
+
+ mem_cgroup_put(memcg);
+
+ if (sc->nr_reclaimed >= nr_to_reclaim)
+ return;
+
/* restart if raced with lru_gen_rotate_memcg() */
if (gen != get_nulls_value(pos))
goto restart;
@@ -4985,11 +4995,6 @@ restart:
bin = get_memcg_bin(bin + 1);
if (bin != first_bin)
goto restart;
-success:
- if (op)
- lru_gen_rotate_memcg(lruvec, op);
-
- mem_cgroup_put(memcg);
}

static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
@@ -0,0 +1,65 @@
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Date: Tue, 5 Oct 2021 18:54:02 +0200
Subject: [PATCH] MIPS: uasm: Enable muhu opcode for MIPS R6

Enable the 'muhu' instruction, complementing the existing 'mulu', needed
to implement a MIPS32 BPF JIT.

Also fix a typo in the existing definition of 'dmulu'.

Signed-off-by: Tony Ambardar <Tony.Ambardar@gmail.com>

This patch is a dependency for my 32-bit MIPS eBPF JIT.

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
---

--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -145,6 +145,7 @@ Ip_u1(_mtlo);
Ip_u3u1u2(_mul);
Ip_u1u2(_multu);
Ip_u3u1u2(_mulu);
+Ip_u3u1u2(_muhu);
Ip_u3u1u2(_nor);
Ip_u3u1u2(_or);
Ip_u2u1u3(_ori);
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -90,7 +90,7 @@ static const struct insn insn_table[insn
RS | RT | RD},
[insn_dmtc0] = {M(cop0_op, dmtc_op, 0, 0, 0, 0), RT | RD | SET},
[insn_dmultu] = {M(spec_op, 0, 0, 0, 0, dmultu_op), RS | RT},
- [insn_dmulu] = {M(spec_op, 0, 0, 0, dmult_dmul_op, dmultu_op),
+ [insn_dmulu] = {M(spec_op, 0, 0, 0, dmultu_dmulu_op, dmultu_op),
RS | RT | RD},
[insn_drotr] = {M(spec_op, 1, 0, 0, 0, dsrl_op), RT | RD | RE},
[insn_drotr32] = {M(spec_op, 1, 0, 0, 0, dsrl32_op), RT | RD | RE},
@@ -150,6 +150,8 @@ static const struct insn insn_table[insn
[insn_mtlo] = {M(spec_op, 0, 0, 0, 0, mtlo_op), RS},
[insn_mulu] = {M(spec_op, 0, 0, 0, multu_mulu_op, multu_op),
RS | RT | RD},
+ [insn_muhu] = {M(spec_op, 0, 0, 0, multu_muhu_op, multu_op),
+ RS | RT | RD},
#ifndef CONFIG_CPU_MIPSR6
[insn_mul] = {M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD},
#else
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -59,7 +59,7 @@ enum opcode {
insn_lddir, insn_ldpte, insn_ldx, insn_lh, insn_lhu, insn_ll, insn_lld,
insn_lui, insn_lw, insn_lwu, insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi,
insn_mflo, insn_modu, insn_movn, insn_movz, insn_mtc0, insn_mthc0,
- insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_mulu, insn_nor,
+ insn_mthi, insn_mtlo, insn_mul, insn_multu, insn_mulu, insn_muhu, insn_nor,
insn_or, insn_ori, insn_pref, insn_rfe, insn_rotr, insn_sb, insn_sc,
insn_scd, insn_seleqz, insn_selnez, insn_sd, insn_sh, insn_sll,
insn_sllv, insn_slt, insn_slti, insn_sltiu, insn_sltu, insn_sra,
@@ -344,6 +344,7 @@ I_u1(_mtlo)
I_u3u1u2(_mul)
I_u1u2(_multu)
I_u3u1u2(_mulu)
+I_u3u1u2(_muhu)
I_u3u1u2(_nor)
I_u3u1u2(_or)
I_u2u1u3(_ori)
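For reference, 'muhu' yields the high 32 bits of an unsigned 32x32-bit multiply, while 'mulu' yields the low 32 bits; the high half is what a 32-bit JIT needs to synthesize 64-bit multiplies. A standalone C illustration of the semantics (not kernel code; the operand values are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t a = 0x80000000u, b = 4;
    uint64_t prod = (uint64_t)a * b;

    printf("mulu (low 32):  0x%08x\n", (unsigned)prod);         /* 0x00000000 */
    printf("muhu (high 32): 0x%08x\n", (unsigned)(prod >> 32)); /* 0x00000002 */
    return 0;
}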
@@ -0,0 +1,31 @@
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Date: Tue, 5 Oct 2021 18:54:03 +0200
Subject: [PATCH] mips: uasm: Add workaround for Loongson-2F nop CPU errata

This patch implements a workaround for the Loongson-2F nop in generated
code, if the existing option CONFIG_CPU_NOP_WORKAROUNDS is set. Before,
the binutils option -mfix-loongson2f-nop was enabled, but no workaround
was done when emitting MIPS code. Now, the nop pseudo instruction is
emitted as "or ax,ax,zero" instead of the default "sll zero,zero,0". This
is consistent with the workaround implemented by binutils.

Link: https://sourceware.org/legacy-ml/binutils/2009-11/msg00387.html

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Reviewed-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
---

--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -249,7 +249,11 @@ static inline void uasm_l##lb(struct uas
#define uasm_i_bnezl(buf, rs, off) uasm_i_bnel(buf, rs, 0, off)
#define uasm_i_ehb(buf) uasm_i_sll(buf, 0, 0, 3)
#define uasm_i_move(buf, a, b) UASM_i_ADDU(buf, a, 0, b)
+#ifdef CONFIG_CPU_NOP_WORKAROUNDS
+#define uasm_i_nop(buf) uasm_i_or(buf, 1, 1, 0)
+#else
#define uasm_i_nop(buf) uasm_i_sll(buf, 0, 0, 0)
+#endif
#define uasm_i_ssnop(buf) uasm_i_sll(buf, 0, 0, 1)

static inline void uasm_i_drotr_safe(u32 **p, unsigned int a1,
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,120 @@
|
||||
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
|
||||
Date: Tue, 5 Oct 2021 18:54:06 +0200
|
||||
Subject: [PATCH] mips: bpf: Add JIT workarounds for CPU errata

This patch adds workarounds for the following CPU errata to the MIPS
eBPF JIT, if enabled in the kernel configuration.

- R10000 ll/sc weak ordering
- Loongson-3 ll/sc weak ordering
- Loongson-2F jump hang

The Loongson-2F nop errata is implemented in uasm, which the JIT uses,
so no additional mitigations are needed for that.

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Reviewed-by: Jiaxun Yang <jiaxun.yang@flygoat.com>
---

--- a/arch/mips/net/bpf_jit_comp.c
+++ b/arch/mips/net/bpf_jit_comp.c
@@ -404,6 +404,7 @@ void emit_alu_r(struct jit_context *ctx,
/* Atomic read-modify-write (32-bit) */
void emit_atomic_r(struct jit_context *ctx, u8 dst, u8 src, s16 off, u8 code)
{
+ LLSC_sync(ctx);
emit(ctx, ll, MIPS_R_T9, off, dst);
switch (code) {
case BPF_ADD:
@@ -420,18 +421,19 @@ void emit_atomic_r(struct jit_context *c
break;
}
emit(ctx, sc, MIPS_R_T8, off, dst);
- emit(ctx, beqz, MIPS_R_T8, -16);
+ emit(ctx, LLSC_beqz, MIPS_R_T8, -16 - LLSC_offset);
emit(ctx, nop); /* Delay slot */
}

/* Atomic compare-and-exchange (32-bit) */
void emit_cmpxchg_r(struct jit_context *ctx, u8 dst, u8 src, u8 res, s16 off)
{
+ LLSC_sync(ctx);
emit(ctx, ll, MIPS_R_T9, off, dst);
emit(ctx, bne, MIPS_R_T9, res, 12);
emit(ctx, move, MIPS_R_T8, src); /* Delay slot */
emit(ctx, sc, MIPS_R_T8, off, dst);
- emit(ctx, beqz, MIPS_R_T8, -20);
+ emit(ctx, LLSC_beqz, MIPS_R_T8, -20 - LLSC_offset);
emit(ctx, move, res, MIPS_R_T9); /* Delay slot */
clobber_reg(ctx, res);
}
--- a/arch/mips/net/bpf_jit_comp.h
+++ b/arch/mips/net/bpf_jit_comp.h
@@ -87,7 +87,7 @@ struct jit_context {
};

/* Emit the instruction if the JIT memory space has been allocated */
-#define emit(ctx, func, ...) \
+#define __emit(ctx, func, ...) \
do { \
if ((ctx)->target != NULL) { \
u32 *p = &(ctx)->target[ctx->jit_index]; \
@@ -95,6 +95,30 @@ do { \
} \
(ctx)->jit_index++; \
} while (0)
+#define emit(...) __emit(__VA_ARGS__)
+
+/* Workaround for R10000 ll/sc errata */
+#ifdef CONFIG_WAR_R10000
+#define LLSC_beqz beqzl
+#else
+#define LLSC_beqz beqz
+#endif
+
+/* Workaround for Loongson-3 ll/sc errata */
+#ifdef CONFIG_CPU_LOONGSON3_WORKAROUNDS
+#define LLSC_sync(ctx) emit(ctx, sync, 0)
+#define LLSC_offset 4
+#else
+#define LLSC_sync(ctx)
+#define LLSC_offset 0
+#endif
+
+/* Workaround for Loongson-2F jump errata */
+#ifdef CONFIG_CPU_JUMP_WORKAROUNDS
+#define JALR_MASK 0xffffffffcfffffffULL
+#else
+#define JALR_MASK (~0ULL)
+#endif

/*
* Mark a BPF register as accessed, it needs to be
--- a/arch/mips/net/bpf_jit_comp64.c
+++ b/arch/mips/net/bpf_jit_comp64.c
@@ -375,6 +375,7 @@ static void emit_atomic_r64(struct jit_c
u8 t1 = MIPS_R_T6;
u8 t2 = MIPS_R_T7;

+ LLSC_sync(ctx);
emit(ctx, lld, t1, off, dst);
switch (code) {
case BPF_ADD:
@@ -391,7 +392,7 @@ static void emit_atomic_r64(struct jit_c
break;
}
emit(ctx, scd, t2, off, dst);
- emit(ctx, beqz, t2, -16);
+ emit(ctx, LLSC_beqz, t2, -16 - LLSC_offset);
emit(ctx, nop); /* Delay slot */
}

@@ -414,7 +415,7 @@ static int emit_call(struct jit_context
push_regs(ctx, ctx->clobbered & JIT_CALLER_REGS, 0, 0);

/* Emit function call */
- emit_mov_i64(ctx, tmp, addr);
+ emit_mov_i64(ctx, tmp, addr & JALR_MASK);
emit(ctx, jalr, MIPS_R_RA, tmp);
emit(ctx, nop); /* Delay slot */
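For illustration only (not part of the patch), here is roughly what the 32-bit atomic-add retry loop above reduces to once the workaround macros are expanded; the exact ALU opcode is an assumption, the offsets come from the patch:

	/*
	 * Editorial sketch of the emit_atomic_r() retry loop for BPF_ADD.
	 * With CONFIG_CPU_LOONGSON3_WORKAROUNDS, LLSC_sync() emits a leading
	 * "sync 0" (4 bytes), so the branch-back target moves from -16 to -20.
	 * With CONFIG_WAR_R10000, LLSC_beqz expands to the branch-likely beqzl.
	 */
	LLSC_sync(ctx);                         /* sync 0, or nothing      */
	emit(ctx, ll, MIPS_R_T9, off, dst);     /* load-linked             */
	/* ... ALU op computing MIPS_R_T8 from MIPS_R_T9 and src ... */
	emit(ctx, sc, MIPS_R_T8, off, dst);     /* store-conditional       */
	emit(ctx, LLSC_beqz, MIPS_R_T8, -16 - LLSC_offset); /* retry if sc failed */
	emit(ctx, nop);                         /* delay slot              */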
@ -0,0 +1,61 @@
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Date: Tue, 5 Oct 2021 18:54:07 +0200
Subject: [PATCH] mips: bpf: Enable eBPF JITs

This patch enables the new eBPF JITs for 32-bit and 64-bit MIPS. It also
disables the old cBPF JIT so that cBPF programs are converted to use the
new JIT.

Workarounds for R4000 CPU errata are not implemented by the JIT, so the
JIT is disabled if any of those workarounds are configured.

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
---

--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3431,6 +3431,7 @@ S: Supported
F: arch/arm64/net/

BPF JIT for MIPS (32-BIT AND 64-BIT)
+M: Johan Almbladh <johan.almbladh@anyfinetworks.com>
M: Paul Burton <paulburton@kernel.org>
L: netdev@vger.kernel.org
L: bpf@vger.kernel.org
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -57,7 +57,6 @@ config MIPS
select HAVE_ARCH_TRACEHOOK
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES
select HAVE_ASM_MODVERSIONS
- select HAVE_CBPF_JIT if !64BIT && !CPU_MICROMIPS
select HAVE_CONTEXT_TRACKING
select HAVE_TIF_NOHZ
select HAVE_C_RECORDMCOUNT
@@ -65,7 +64,10 @@ config MIPS
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
- select HAVE_EBPF_JIT if 64BIT && !CPU_MICROMIPS && TARGET_ISA_REV >= 2
+ select HAVE_EBPF_JIT if !CPU_MICROMIPS && \
+ !CPU_DADDI_WORKAROUNDS && \
+ !CPU_R4000_WORKAROUNDS && \
+ !CPU_R4400_WORKAROUNDS
select HAVE_EXIT_THREAD
select HAVE_FAST_GUP
select HAVE_FTRACE_MCOUNT_RECORD
--- a/arch/mips/net/Makefile
+++ b/arch/mips/net/Makefile
@@ -2,9 +2,10 @@
# MIPS networking code

obj-$(CONFIG_MIPS_CBPF_JIT) += bpf_jit.o bpf_jit_asm.o
+obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp.o

ifeq ($(CONFIG_32BIT),y)
- obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp.o bpf_jit_comp32.o
+ obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp32.o
else
- obj-$(CONFIG_MIPS_EBPF_JIT) += ebpf_jit.o
+ obj-$(CONFIG_MIPS_EBPF_JIT) += bpf_jit_comp64.o
endif
@ -0,0 +1,387 @@
From: Johan Almbladh <johan.almbladh@anyfinetworks.com>
Date: Tue, 5 Oct 2021 18:54:08 +0200
Subject: [PATCH] mips: bpf: Remove old BPF JIT implementations

This patch removes the old 32-bit cBPF and 64-bit eBPF JIT implementations.
They are replaced by a new eBPF implementation that supports both 32-bit
and 64-bit MIPS CPUs.

Signed-off-by: Johan Almbladh <johan.almbladh@anyfinetworks.com>
---
delete mode 100644 arch/mips/net/bpf_jit.c
delete mode 100644 arch/mips/net/bpf_jit.h
delete mode 100644 arch/mips/net/bpf_jit_asm.S
delete mode 100644 arch/mips/net/ebpf_jit.c

--- a/arch/mips/net/bpf_jit.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Just-In-Time compiler for BPF filters on MIPS
- *
- * Copyright (c) 2014 Imagination Technologies Ltd.
- * Author: Markos Chandras <markos.chandras@imgtec.com>
- */
-
-#ifndef BPF_JIT_MIPS_OP_H
-#define BPF_JIT_MIPS_OP_H
-
-/* Registers used by JIT */
-#define MIPS_R_ZERO 0
-#define MIPS_R_V0 2
-#define MIPS_R_A0 4
-#define MIPS_R_A1 5
-#define MIPS_R_T4 12
-#define MIPS_R_T5 13
-#define MIPS_R_T6 14
-#define MIPS_R_T7 15
-#define MIPS_R_S0 16
-#define MIPS_R_S1 17
-#define MIPS_R_S2 18
-#define MIPS_R_S3 19
-#define MIPS_R_S4 20
-#define MIPS_R_S5 21
-#define MIPS_R_S6 22
-#define MIPS_R_S7 23
-#define MIPS_R_SP 29
-#define MIPS_R_RA 31
-
-/* Conditional codes */
-#define MIPS_COND_EQ 0x1
-#define MIPS_COND_GE (0x1 << 1)
-#define MIPS_COND_GT (0x1 << 2)
-#define MIPS_COND_NE (0x1 << 3)
-#define MIPS_COND_ALL (0x1 << 4)
-/* Conditionals on X register or K immediate */
-#define MIPS_COND_X (0x1 << 5)
-#define MIPS_COND_K (0x1 << 6)
-
-#define r_ret MIPS_R_V0
-
-/*
- * Use 2 scratch registers to avoid pipeline interlocks.
- * There is no overhead during epilogue and prologue since
- * any of the $s0-$s6 registers will only be preserved if
- * they are going to actually be used.
- */
-#define r_skb_hl MIPS_R_S0 /* skb header length */
-#define r_skb_data MIPS_R_S1 /* skb actual data */
-#define r_off MIPS_R_S2
-#define r_A MIPS_R_S3
-#define r_X MIPS_R_S4
-#define r_skb MIPS_R_S5
-#define r_M MIPS_R_S6
-#define r_skb_len MIPS_R_S7
-#define r_s0 MIPS_R_T4 /* scratch reg 1 */
-#define r_s1 MIPS_R_T5 /* scratch reg 2 */
-#define r_tmp_imm MIPS_R_T6 /* No need to preserve this */
-#define r_tmp MIPS_R_T7 /* No need to preserve this */
-#define r_zero MIPS_R_ZERO
-#define r_sp MIPS_R_SP
-#define r_ra MIPS_R_RA
-
-#ifndef __ASSEMBLY__
-
-/* Declare ASM helpers */
-
-#define DECLARE_LOAD_FUNC(func) \
- extern u8 func(unsigned long *skb, int offset); \
- extern u8 func##_negative(unsigned long *skb, int offset); \
- extern u8 func##_positive(unsigned long *skb, int offset)
-
-DECLARE_LOAD_FUNC(sk_load_word);
-DECLARE_LOAD_FUNC(sk_load_half);
-DECLARE_LOAD_FUNC(sk_load_byte);
-
-#endif
-
-#endif /* BPF_JIT_MIPS_OP_H */
--- a/arch/mips/net/bpf_jit_asm.S
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * bpf_jib_asm.S: Packet/header access helper functions for MIPS/MIPS64 BPF
- * compiler.
- *
- * Copyright (C) 2015 Imagination Technologies Ltd.
- * Author: Markos Chandras <markos.chandras@imgtec.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2 of the License.
- */
-
-#include <asm/asm.h>
-#include <asm/isa-rev.h>
-#include <asm/regdef.h>
-#include "bpf_jit.h"
-
-/* ABI
- *
- * r_skb_hl skb header length
- * r_skb_data skb data
- * r_off(a1) offset register
- * r_A BPF register A
- * r_X PF register X
- * r_skb(a0) *skb
- * r_M *scratch memory
- * r_skb_le skb length
- * r_s0 Scratch register 0
- * r_s1 Scratch register 1
- *
- * On entry:
- * a0: *skb
- * a1: offset (imm or imm + X)
- *
- * All non-BPF-ABI registers are free for use. On return, we only
- * care about r_ret. The BPF-ABI registers are assumed to remain
- * unmodified during the entire filter operation.
- */
-
-#define skb a0
-#define offset a1
-#define SKF_LL_OFF (-0x200000) /* Can't include linux/filter.h in assembly */
-
- /* We know better :) so prevent assembler reordering etc */
- .set noreorder
-
-#define is_offset_negative(TYPE) \
- /* If offset is negative we have more work to do */ \
- slti t0, offset, 0; \
- bgtz t0, bpf_slow_path_##TYPE##_neg; \
- /* Be careful what follows in DS. */
-
-#define is_offset_in_header(SIZE, TYPE) \
- /* Reading from header? */ \
- addiu $r_s0, $r_skb_hl, -SIZE; \
- slt t0, $r_s0, offset; \
- bgtz t0, bpf_slow_path_##TYPE; \
-
-LEAF(sk_load_word)
- is_offset_negative(word)
-FEXPORT(sk_load_word_positive)
- is_offset_in_header(4, word)
- /* Offset within header boundaries */
- PTR_ADDU t1, $r_skb_data, offset
- .set reorder
- lw $r_A, 0(t1)
- .set noreorder
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-# if MIPS_ISA_REV >= 2
- wsbh t0, $r_A
- rotr $r_A, t0, 16
-# else
- sll t0, $r_A, 24
- srl t1, $r_A, 24
- srl t2, $r_A, 8
- or t0, t0, t1
- andi t2, t2, 0xff00
- andi t1, $r_A, 0xff00
- or t0, t0, t2
- sll t1, t1, 8
- or $r_A, t0, t1
-# endif
-#endif
- jr $r_ra
- move $r_ret, zero
- END(sk_load_word)
-
-LEAF(sk_load_half)
- is_offset_negative(half)
-FEXPORT(sk_load_half_positive)
- is_offset_in_header(2, half)
- /* Offset within header boundaries */
- PTR_ADDU t1, $r_skb_data, offset
- lhu $r_A, 0(t1)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-# if MIPS_ISA_REV >= 2
- wsbh $r_A, $r_A
-# else
- sll t0, $r_A, 8
- srl t1, $r_A, 8
- andi t0, t0, 0xff00
- or $r_A, t0, t1
-# endif
-#endif
- jr $r_ra
- move $r_ret, zero
- END(sk_load_half)
-
-LEAF(sk_load_byte)
- is_offset_negative(byte)
-FEXPORT(sk_load_byte_positive)
- is_offset_in_header(1, byte)
- /* Offset within header boundaries */
- PTR_ADDU t1, $r_skb_data, offset
- lbu $r_A, 0(t1)
- jr $r_ra
- move $r_ret, zero
- END(sk_load_byte)
-
-/*
- * call skb_copy_bits:
- * (prototype in linux/skbuff.h)
- *
- * int skb_copy_bits(sk_buff *skb, int offset, void *to, int len)
- *
- * o32 mandates we leave 4 spaces for argument registers in case
- * the callee needs to use them. Even though we don't care about
- * the argument registers ourselves, we need to allocate that space
- * to remain ABI compliant since the callee may want to use that space.
- * We also allocate 2 more spaces for $r_ra and our return register (*to).
- *
- * n64 is a bit different. The *caller* will allocate the space to preserve
- * the arguments. So in 64-bit kernels, we allocate the 4-arg space for no
- * good reason but it does not matter that much really.
- *
- * (void *to) is returned in r_s0
- *
- */
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-#define DS_OFFSET(SIZE) (4 * SZREG)
-#else
-#define DS_OFFSET(SIZE) ((4 * SZREG) + (4 - SIZE))
-#endif
-#define bpf_slow_path_common(SIZE) \
- /* Quick check. Are we within reasonable boundaries? */ \
- LONG_ADDIU $r_s1, $r_skb_len, -SIZE; \
- sltu $r_s0, offset, $r_s1; \
- beqz $r_s0, fault; \
- /* Load 4th argument in DS */ \
- LONG_ADDIU a3, zero, SIZE; \
- PTR_ADDIU $r_sp, $r_sp, -(6 * SZREG); \
- PTR_LA t0, skb_copy_bits; \
- PTR_S $r_ra, (5 * SZREG)($r_sp); \
- /* Assign low slot to a2 */ \
- PTR_ADDIU a2, $r_sp, DS_OFFSET(SIZE); \
- jalr t0; \
- /* Reset our destination slot (DS but it's ok) */ \
- INT_S zero, (4 * SZREG)($r_sp); \
- /* \
- * skb_copy_bits returns 0 on success and -EFAULT \
- * on error. Our data live in a2. Do not bother with \
- * our data if an error has been returned. \
- */ \
- /* Restore our frame */ \
- PTR_L $r_ra, (5 * SZREG)($r_sp); \
- INT_L $r_s0, (4 * SZREG)($r_sp); \
- bltz v0, fault; \
- PTR_ADDIU $r_sp, $r_sp, 6 * SZREG; \
- move $r_ret, zero; \
-
-NESTED(bpf_slow_path_word, (6 * SZREG), $r_sp)
- bpf_slow_path_common(4)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-# if MIPS_ISA_REV >= 2
- wsbh t0, $r_s0
- jr $r_ra
- rotr $r_A, t0, 16
-# else
- sll t0, $r_s0, 24
- srl t1, $r_s0, 24
- srl t2, $r_s0, 8
- or t0, t0, t1
- andi t2, t2, 0xff00
- andi t1, $r_s0, 0xff00
- or t0, t0, t2
- sll t1, t1, 8
- jr $r_ra
- or $r_A, t0, t1
-# endif
-#else
- jr $r_ra
- move $r_A, $r_s0
-#endif
-
- END(bpf_slow_path_word)
-
-NESTED(bpf_slow_path_half, (6 * SZREG), $r_sp)
- bpf_slow_path_common(2)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-# if MIPS_ISA_REV >= 2
- jr $r_ra
- wsbh $r_A, $r_s0
-# else
- sll t0, $r_s0, 8
- andi t1, $r_s0, 0xff00
- andi t0, t0, 0xff00
- srl t1, t1, 8
- jr $r_ra
- or $r_A, t0, t1
-# endif
-#else
- jr $r_ra
- move $r_A, $r_s0
-#endif
-
- END(bpf_slow_path_half)
-
-NESTED(bpf_slow_path_byte, (6 * SZREG), $r_sp)
- bpf_slow_path_common(1)
- jr $r_ra
- move $r_A, $r_s0
-
- END(bpf_slow_path_byte)
-
-/*
- * Negative entry points
- */
- .macro bpf_is_end_of_data
- li t0, SKF_LL_OFF
- /* Reading link layer data? */
- slt t1, offset, t0
- bgtz t1, fault
- /* Be careful what follows in DS. */
- .endm
-/*
- * call skb_copy_bits:
- * (prototype in linux/filter.h)
- *
- * void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb,
- * int k, unsigned int size)
- *
- * see above (bpf_slow_path_common) for ABI restrictions
- */
-#define bpf_negative_common(SIZE) \
- PTR_ADDIU $r_sp, $r_sp, -(6 * SZREG); \
- PTR_LA t0, bpf_internal_load_pointer_neg_helper; \
- PTR_S $r_ra, (5 * SZREG)($r_sp); \
- jalr t0; \
- li a2, SIZE; \
- PTR_L $r_ra, (5 * SZREG)($r_sp); \
- /* Check return pointer */ \
- beqz v0, fault; \
- PTR_ADDIU $r_sp, $r_sp, 6 * SZREG; \
- /* Preserve our pointer */ \
- move $r_s0, v0; \
- /* Set return value */ \
- move $r_ret, zero; \
-
-bpf_slow_path_word_neg:
- bpf_is_end_of_data
-NESTED(sk_load_word_negative, (6 * SZREG), $r_sp)
- bpf_negative_common(4)
- jr $r_ra
- lw $r_A, 0($r_s0)
- END(sk_load_word_negative)
-
-bpf_slow_path_half_neg:
- bpf_is_end_of_data
-NESTED(sk_load_half_negative, (6 * SZREG), $r_sp)
- bpf_negative_common(2)
- jr $r_ra
- lhu $r_A, 0($r_s0)
- END(sk_load_half_negative)
-
-bpf_slow_path_byte_neg:
- bpf_is_end_of_data
-NESTED(sk_load_byte_negative, (6 * SZREG), $r_sp)
- bpf_negative_common(1)
- jr $r_ra
- lbu $r_A, 0($r_s0)
- END(sk_load_byte_negative)
-
-fault:
- jr $r_ra
- addiu $r_ret, zero, 1
@ -0,0 +1,105 @@
From 815f0e738a8d5663a02350e2580706829144a722 Mon Sep 17 00:00:00 2001
From: Horatiu Vultur <horatiu.vultur@microchip.com>
Date: Wed, 3 Nov 2021 09:50:59 +0100
Subject: [PATCH] clk: gate: Add devm_clk_hw_register_gate()

Add devm_clk_hw_register_gate() - devres-managed version of
clk_hw_register_gate()

Suggested-by: Stephen Boyd <sboyd@kernel.org>
Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
Acked-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Signed-off-by: Nicolas Ferre <nicolas.ferre@microchip.com>
Link: https://lore.kernel.org/r/20211103085102.1656081-2-horatiu.vultur@microchip.com
---
drivers/clk/clk-gate.c | 35 +++++++++++++++++++++++++++++++++++
include/linux/clk-provider.h | 23 +++++++++++++++++++++++
2 files changed, 58 insertions(+)

--- a/drivers/clk/clk-gate.c
+++ b/drivers/clk/clk-gate.c
@@ -7,6 +7,7 @@
*/

#include <linux/clk-provider.h>
+#include <linux/device.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/io.h>
@@ -222,3 +223,37 @@ void clk_hw_unregister_gate(struct clk_h
kfree(gate);
}
EXPORT_SYMBOL_GPL(clk_hw_unregister_gate);
+
+static void devm_clk_hw_release_gate(struct device *dev, void *res)
+{
+ clk_hw_unregister_gate(*(struct clk_hw **)res);
+}
+
+struct clk_hw *__devm_clk_hw_register_gate(struct device *dev,
+ struct device_node *np, const char *name,
+ const char *parent_name, const struct clk_hw *parent_hw,
+ const struct clk_parent_data *parent_data,
+ unsigned long flags,
+ void __iomem *reg, u8 bit_idx,
+ u8 clk_gate_flags, spinlock_t *lock)
+{
+ struct clk_hw **ptr, *hw;
+
+ ptr = devres_alloc(devm_clk_hw_release_gate, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return ERR_PTR(-ENOMEM);
+
+ hw = __clk_hw_register_gate(dev, np, name, parent_name, parent_hw,
+ parent_data, flags, reg, bit_idx,
+ clk_gate_flags, lock);
+
+ if (!IS_ERR(hw)) {
+ *ptr = hw;
+ devres_add(dev, ptr);
+ } else {
+ devres_free(ptr);
+ }
+
+ return hw;
+}
+EXPORT_SYMBOL_GPL(__devm_clk_hw_register_gate);
--- a/include/linux/clk-provider.h
+++ b/include/linux/clk-provider.h
@@ -490,6 +490,13 @@ struct clk_hw *__clk_hw_register_gate(st
unsigned long flags,
void __iomem *reg, u8 bit_idx,
u8 clk_gate_flags, spinlock_t *lock);
+struct clk_hw *__devm_clk_hw_register_gate(struct device *dev,
+ struct device_node *np, const char *name,
+ const char *parent_name, const struct clk_hw *parent_hw,
+ const struct clk_parent_data *parent_data,
+ unsigned long flags,
+ void __iomem *reg, u8 bit_idx,
+ u8 clk_gate_flags, spinlock_t *lock);
struct clk *clk_register_gate(struct device *dev, const char *name,
const char *parent_name, unsigned long flags,
void __iomem *reg, u8 bit_idx,
@@ -544,6 +551,22 @@ struct clk *clk_register_gate(struct dev
__clk_hw_register_gate((dev), NULL, (name), NULL, NULL, (parent_data), \
(flags), (reg), (bit_idx), \
(clk_gate_flags), (lock))
+/**
+ * devm_clk_hw_register_gate - register a gate clock with the clock framework
+ * @dev: device that is registering this clock
+ * @name: name of this clock
+ * @parent_name: name of this clock's parent
+ * @flags: framework-specific flags for this clock
+ * @reg: register address to control gating of this clock
+ * @bit_idx: which bit in the register controls gating of this clock
+ * @clk_gate_flags: gate-specific flags for this clock
+ * @lock: shared register lock for this clock
+ */
+#define devm_clk_hw_register_gate(dev, name, parent_name, flags, reg, bit_idx,\
+ clk_gate_flags, lock) \
+ __devm_clk_hw_register_gate((dev), NULL, (name), (parent_name), NULL, \
+ NULL, (flags), (reg), (bit_idx), \
+ (clk_gate_flags), (lock))
void clk_unregister_gate(struct clk *clk);
void clk_hw_unregister_gate(struct clk_hw *hw);
int clk_gate_is_enabled(struct clk_hw *hw);
@ -0,0 +1,52 @@
From 02d6fdecb9c38de19065f6bed8d5214556fd061d Mon Sep 17 00:00:00 2001
From: Ansuel Smith <ansuelsmth@gmail.com>
Date: Thu, 4 Nov 2021 16:00:40 +0100
Subject: regmap: allow to define reg_update_bits for no bus configuration

Some devices require special handling for reg_update_bits and can't use
the normal regmap read/write logic. An example is when locking is
handled by the device and rmw operations require atomic operations.
Allow declaring a dedicated function in regmap_config for
reg_update_bits in the no-bus configuration.

Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
Link: https://lore.kernel.org/r/20211104150040.1260-1-ansuelsmth@gmail.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
drivers/base/regmap/regmap.c | 1 +
include/linux/regmap.h | 7 +++++++
2 files changed, 8 insertions(+)

--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -877,6 +877,7 @@ struct regmap *__regmap_init(struct devi
if (!bus) {
map->reg_read = config->reg_read;
map->reg_write = config->reg_write;
+ map->reg_update_bits = config->reg_update_bits;

map->defer_caching = false;
goto skip_format_initialization;
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -290,6 +290,11 @@ typedef void (*regmap_unlock)(void *);
* read operation on a bus such as SPI, I2C, etc. Most of the
* devices do not need this.
* @reg_write: Same as above for writing.
+ * @reg_update_bits: Optional callback that if filled will be used to perform
+ * all the update_bits(rmw) operation. Should only be provided
+ * if the function require special handling with lock and reg
+ * handling and the operation cannot be represented as a simple
+ * update_bits operation on a bus such as SPI, I2C, etc.
* @fast_io: Register IO is fast. Use a spinlock instead of a mutex
* to perform locking. This field is ignored if custom lock/unlock
* functions are used (see fields lock/unlock of struct regmap_config).
@@ -372,6 +377,8 @@ struct regmap_config {

int (*reg_read)(void *context, unsigned int reg, unsigned int *val);
int (*reg_write)(void *context, unsigned int reg, unsigned int val);
+ int (*reg_update_bits)(void *context, unsigned int reg,
+ unsigned int mask, unsigned int val);

bool fast_io;
@ -0,0 +1,37 @@
From 0dc0da881b4574d1e04a079ab2ea75da61f5ad2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Fri, 11 Mar 2022 10:32:33 +0100
Subject: [PATCH] tty: serial: bcm63xx: use more precise Kconfig symbol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Patches lowering SERIAL_BCM63XX dependencies led to a discussion and
documentation change regarding "depends" usage. Adjust Kconfig entry to
match current guidelines. Make this symbol available for relevant
architectures only.

Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Reviewed-by: Geert Uytterhoeven <geert+renesas@glider.be>
Acked-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Ref: f35a07f92616 ("tty: serial: bcm63xx: lower driver dependencies")
Ref: 18084e435ff6 ("Documentation/kbuild: Document platform dependency practises")
Link: https://lore.kernel.org/r/20220311093233.10012-1-zajec5@gmail.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
drivers/tty/serial/Kconfig | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

--- a/drivers/tty/serial/Kconfig
+++ b/drivers/tty/serial/Kconfig
@@ -1098,7 +1098,8 @@ config SERIAL_TIMBERDALE
config SERIAL_BCM63XX
tristate "Broadcom BCM63xx/BCM33xx UART support"
select SERIAL_CORE
- depends on COMMON_CLK
+ depends on ARCH_BCM4908 || ARCH_BCM_63XX || BCM63XX || BMIPS_GENERIC || COMPILE_TEST
+ default ARCH_BCM4908 || ARCH_BCM_63XX || BCM63XX || BMIPS_GENERIC
help
This enables the driver for the onchip UART core found on
the following chipsets:
@ -0,0 +1,49 @@
From cdbc4e3399ed8cdcf234a85f7a2482b622379e82 Mon Sep 17 00:00:00 2001
From: Connor O'Brien <connoro@google.com>
Date: Wed, 12 Jan 2022 00:25:03 +0000
Subject: [PATCH] tools/resolve_btfids: Build with host flags

resolve_btfids is built using $(HOSTCC) and $(HOSTLD) but does not
pick up the corresponding flags. As a result, host-specific settings
(such as a sysroot specified via HOSTCFLAGS=--sysroot=..., or a linker
specified via HOSTLDFLAGS=-fuse-ld=...) will not be respected.

Fix this by setting CFLAGS to KBUILD_HOSTCFLAGS and LDFLAGS to
KBUILD_HOSTLDFLAGS.

Also pass the cflags through to libbpf via EXTRA_CFLAGS to ensure that
the host libbpf is built with flags consistent with resolve_btfids.

Signed-off-by: Connor O'Brien <connoro@google.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Link: https://lore.kernel.org/bpf/20220112002503.115968-1-connoro@google.com
(cherry picked from commit 0e3a1c902ffb56e9fe4416f0cd382c97b09ecbf6)
Signed-off-by: Stijn Tintel <stijn@linux-ipv6.be>
---
tools/bpf/resolve_btfids/Makefile | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)

--- a/tools/bpf/resolve_btfids/Makefile
+++ b/tools/bpf/resolve_btfids/Makefile
@@ -23,6 +23,8 @@ CC = $(HOSTCC)
LD = $(HOSTLD)
ARCH = $(HOSTARCH)
RM ?= rm
+CFLAGS := $(KBUILD_HOSTCFLAGS)
+LDFLAGS := $(KBUILD_HOSTLDFLAGS)

OUTPUT ?= $(srctree)/tools/bpf/resolve_btfids/

@@ -45,9 +47,9 @@ $(SUBCMDOBJ): fixdep FORCE | $(OUTPUT)/l
$(Q)$(MAKE) -C $(SUBCMD_SRC) OUTPUT=$(abspath $(dir $@))/ $(abspath $@)

$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)/libbpf
- $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(abspath $(dir $@))/ $(abspath $@)
+ $(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) OUTPUT=$(abspath $(dir $@))/ EXTRA_CFLAGS="$(CFLAGS)" $(abspath $@)

-CFLAGS := -g \
+CFLAGS += -g \
 -I$(srctree)/tools/include \
 -I$(srctree)/tools/include/uapi \
 -I$(LIBBPF_SRC) \
@ -0,0 +1,997 @@
From a77725a9a3c5924e2fd4cd5b3557dd92a8e46f87 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Mon, 25 Oct 2021 11:05:45 -0500
Subject: [PATCH 1/1] scripts/dtc: Update to upstream version
v1.6.1-19-g0a3a9d3449c8

This adds the following commits from upstream:

0a3a9d3449c8 checks: Add an interrupt-map check
8fd24744e361 checks: Ensure '#interrupt-cells' only exists in interrupt providers
d8d1a9a77863 checks: Drop interrupt provider '#address-cells' check
52a16fd72824 checks: Make interrupt_provider check dependent on interrupts_extended_is_cell
37fd700685da treesource: Maintain phandle label/path on output
e33ce1d6a8c7 flattree: Use '\n', not ';' to separate asm pseudo-ops
d24cc189dca6 asm: Use assembler macros instead of cpp macros
ff3a30c115ad asm: Use .asciz and .ascii instead of .string
5eb5927d81ee fdtdump: fix -Werror=int-to-pointer-cast
0869f8269161 libfdt: Add ALIGNMENT error string
69595a167f06 checks: Fix bus-range check
72d09e2682a4 Makefile: add -Wsign-compare to warning options
b587787ef388 checks: Fix signedness comparisons warnings
69bed6c2418f dtc: Wrap phandle validity check
910221185560 fdtget: Fix signedness comparisons warnings
d966f08fcd21 tests: Fix signedness comparisons warnings
ecfb438c07fa dtc: Fix signedness comparisons warnings: pointer diff
5bec74a6d135 dtc: Fix signedness comparisons warnings: reservednum
24e7f511fd4a fdtdump: Fix signedness comparisons warnings
b6910bec1161 Bump version to v1.6.1
21d61d18f968 Fix CID 1461557
4c2ef8f4d14c checks: Introduce is_multiple_of()
e59ca36fb70e Make handling of cpp line information more tolerant
0c3fd9b6aceb checks: Drop interrupt_cells_is_cell check
6b3081abc4ac checks: Add check_is_cell() for all phandle+arg properties
2dffc192a77f yamltree: Remove marker ordering dependency
61e513439e40 pylibfdt: Rework "avoid unused variable warning" lines
c8bddd106095 tests: add a positive gpio test case
ad4abfadb687 checks: replace strstr and strrchr with strends
09c6a6e88718 dtc.h: add strends for suffix matching
9bb9b8d0b4a0 checks: tigthen up nr-gpios prop exception
b07b62ee3342 libfdt: Add FDT alignment check to fdt_check_header()
a2def5479950 libfdt: Check that the root-node name is empty
4ca61f84dc21 libfdt: Check that there is only one root node
34d708249a91 dtc: Remove -O dtbo support
8e7ff260f755 libfdt: Fix a possible "unchecked return value" warning
88875268c05c checks: Warn on node-name and property name being the same
9d2279e7e6ee checks: Change node-name check to match devicetree spec
f527c867a8c6 util: limit gnu_printf format attribute to gcc >= 4.4.0

Reviewed-by: Frank Rowand <frank.rowand@sony.com>
Tested-by: Frank Rowand <frank.rowand@sony.com>
Signed-off-by: Rob Herring <robh@kernel.org>
---
scripts/dtc/checks.c | 222 ++++++++++++++++++++++--------
scripts/dtc/dtc-lexer.l | 2 +-
scripts/dtc/dtc.c | 6 +-
scripts/dtc/dtc.h | 40 +++++-
scripts/dtc/flattree.c | 11 +-
scripts/dtc/libfdt/fdt.c | 4 +
scripts/dtc/libfdt/fdt_rw.c | 18 ++-
scripts/dtc/libfdt/fdt_strerror.c | 1 +
scripts/dtc/libfdt/libfdt.h | 7 +
scripts/dtc/livetree.c | 6 +-
scripts/dtc/treesource.c | 48 +++----
scripts/dtc/util.h | 6 +-
scripts/dtc/version_gen.h | 2 +-
scripts/dtc/yamltree.c | 16 ++-
14 files changed, 275 insertions(+), 114 deletions(-)

--- a/scripts/dtc/checks.c
+++ b/scripts/dtc/checks.c
@@ -143,6 +143,14 @@ static void check_nodes_props(struct che
check_nodes_props(c, dti, child);
}

+static bool is_multiple_of(int multiple, int divisor)
+{
+ if (divisor == 0)
+ return multiple == 0;
+ else
+ return (multiple % divisor) == 0;
+}
+
static bool run_check(struct check *c, struct dt_info *dti)
{
struct node *dt = dti->dt;
@@ -297,19 +305,20 @@ ERROR(duplicate_property_names, check_du
#define LOWERCASE "abcdefghijklmnopqrstuvwxyz"
#define UPPERCASE "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define DIGITS "0123456789"
-#define PROPNODECHARS LOWERCASE UPPERCASE DIGITS ",._+*#?-"
+#define NODECHARS LOWERCASE UPPERCASE DIGITS ",._+-@"
+#define PROPCHARS LOWERCASE UPPERCASE DIGITS ",._+*#?-"
#define PROPNODECHARSSTRICT LOWERCASE UPPERCASE DIGITS ",-"

static void check_node_name_chars(struct check *c, struct dt_info *dti,
struct node *node)
{
- int n = strspn(node->name, c->data);
+ size_t n = strspn(node->name, c->data);

if (n < strlen(node->name))
FAIL(c, dti, node, "Bad character '%c' in node name",
node->name[n]);
}
-ERROR(node_name_chars, check_node_name_chars, PROPNODECHARS "@");
+ERROR(node_name_chars, check_node_name_chars, NODECHARS);

static void check_node_name_chars_strict(struct check *c, struct dt_info *dti,
struct node *node)
@@ -330,6 +339,20 @@ static void check_node_name_format(struc
}
ERROR(node_name_format, check_node_name_format, NULL, &node_name_chars);

+static void check_node_name_vs_property_name(struct check *c,
+ struct dt_info *dti,
+ struct node *node)
+{
+ if (!node->parent)
+ return;
+
+ if (get_property(node->parent, node->name)) {
+ FAIL(c, dti, node, "node name and property name conflict");
+ }
+}
+WARNING(node_name_vs_property_name, check_node_name_vs_property_name,
+ NULL, &node_name_chars);
+
static void check_unit_address_vs_reg(struct check *c, struct dt_info *dti,
struct node *node)
{
@@ -363,14 +386,14 @@ static void check_property_name_chars(st
struct property *prop;

for_each_property(node, prop) {
- int n = strspn(prop->name, c->data);
+ size_t n = strspn(prop->name, c->data);

if (n < strlen(prop->name))
FAIL_PROP(c, dti, node, prop, "Bad character '%c' in property name",
prop->name[n]);
}
}
-ERROR(property_name_chars, check_property_name_chars, PROPNODECHARS);
+ERROR(property_name_chars, check_property_name_chars, PROPCHARS);

static void check_property_name_chars_strict(struct check *c,
struct dt_info *dti,
@@ -380,7 +403,7 @@ static void check_property_name_chars_st

for_each_property(node, prop) {
const char *name = prop->name;
- int n = strspn(name, c->data);
+ size_t n = strspn(name, c->data);

if (n == strlen(prop->name))
continue;
@@ -497,7 +520,7 @@ static cell_t check_phandle_prop(struct

phandle = propval_cell(prop);

- if ((phandle == 0) || (phandle == -1)) {
+ if (!phandle_is_valid(phandle)) {
FAIL_PROP(c, dti, node, prop, "bad value (0x%x) in %s property",
phandle, prop->name);
return 0;
@@ -556,7 +579,7 @@ static void check_name_properties(struct
if (!prop)
return; /* No name property, that's fine */

- if ((prop->val.len != node->basenamelen+1)
+ if ((prop->val.len != node->basenamelen + 1U)
|| (memcmp(prop->val.val, node->name, node->basenamelen) != 0)) {
FAIL(c, dti, node, "\"name\" property is incorrect (\"%s\" instead"
" of base node name)", prop->val.val);
@@ -657,7 +680,6 @@ ERROR(omit_unused_nodes, fixup_omit_unus
*/
WARNING_IF_NOT_CELL(address_cells_is_cell, "#address-cells");
WARNING_IF_NOT_CELL(size_cells_is_cell, "#size-cells");
-WARNING_IF_NOT_CELL(interrupt_cells_is_cell, "#interrupt-cells");

WARNING_IF_NOT_STRING(device_type_is_string, "device_type");
WARNING_IF_NOT_STRING(model_is_string, "model");
@@ -672,8 +694,7 @@ static void check_names_is_string_list(s
struct property *prop;

for_each_property(node, prop) {
- const char *s = strrchr(prop->name, '-');
- if (!s || !streq(s, "-names"))
+ if (!strends(prop->name, "-names"))
continue;

c->data = prop->name;
@@ -753,7 +774,7 @@ static void check_reg_format(struct chec
size_cells = node_size_cells(node->parent);
entrylen = (addr_cells + size_cells) * sizeof(cell_t);

- if (!entrylen || (prop->val.len % entrylen) != 0)
+ if (!is_multiple_of(prop->val.len, entrylen))
FAIL_PROP(c, dti, node, prop, "property has invalid length (%d bytes) "
"(#address-cells == %d, #size-cells == %d)",
prop->val.len, addr_cells, size_cells);
@@ -794,7 +815,7 @@ static void check_ranges_format(struct c
"#size-cells (%d) differs from %s (%d)",
ranges, c_size_cells, node->parent->fullpath,
p_size_cells);
- } else if ((prop->val.len % entrylen) != 0) {
+ } else if (!is_multiple_of(prop->val.len, entrylen)) {
FAIL_PROP(c, dti, node, prop, "\"%s\" property has invalid length (%d bytes) "
"(parent #address-cells == %d, child #address-cells == %d, "
"#size-cells == %d)", ranges, prop->val.len,
@@ -871,7 +892,7 @@ static void check_pci_device_bus_num(str
} else {
cells = (cell_t *)prop->val.val;
min_bus = fdt32_to_cpu(cells[0]);
- max_bus = fdt32_to_cpu(cells[0]);
+ max_bus = fdt32_to_cpu(cells[1]);
}
if ((bus_num < min_bus) || (bus_num > max_bus))
FAIL_PROP(c, dti, node, prop, "PCI bus number %d out of range, expected (%d - %d)",
@@ -1367,9 +1388,9 @@ static void check_property_phandle_args(
const struct provider *provider)
{
struct node *root = dti->dt;
- int cell, cellsize = 0;
+ unsigned int cell, cellsize = 0;

- if (prop->val.len % sizeof(cell_t)) {
+ if (!is_multiple_of(prop->val.len, sizeof(cell_t))) {
FAIL_PROP(c, dti, node, prop,
"property size (%d) is invalid, expected multiple of %zu",
prop->val.len, sizeof(cell_t));
@@ -1379,14 +1400,14 @@ static void check_property_phandle_args(
for (cell = 0; cell < prop->val.len / sizeof(cell_t); cell += cellsize + 1) {
struct node *provider_node;
struct property *cellprop;
- int phandle;
+ cell_t phandle;

phandle = propval_cell_n(prop, cell);
/*
* Some bindings use a cell value 0 or -1 to skip over optional
* entries when each index position has a specific definition.
*/
- if (phandle == 0 || phandle == -1) {
+ if (!phandle_is_valid(phandle)) {
/* Give up if this is an overlay with external references */
if (dti->dtsflags & DTSF_PLUGIN)
break;
@@ -1452,7 +1473,8 @@ static void check_provider_cells_propert
}
#define WARNING_PROPERTY_PHANDLE_CELLS(nm, propname, cells_name, ...) \
static struct provider nm##_provider = { (propname), (cells_name), __VA_ARGS__ }; \
- WARNING(nm##_property, check_provider_cells_property, &nm##_provider, &phandle_references);
+ WARNING_IF_NOT_CELL(nm##_is_cell, cells_name); \
+ WARNING(nm##_property, check_provider_cells_property, &nm##_provider, &nm##_is_cell, &phandle_references);

WARNING_PROPERTY_PHANDLE_CELLS(clocks, "clocks", "#clock-cells");
WARNING_PROPERTY_PHANDLE_CELLS(cooling_device, "cooling-device", "#cooling-cells");
@@ -1473,24 +1495,17 @@ WARNING_PROPERTY_PHANDLE_CELLS(thermal_s

static bool prop_is_gpio(struct property *prop)
{
- char *str;
-
/*
* *-gpios and *-gpio can appear in property names,
* so skip over any false matches (only one known ATM)
*/
- if (strstr(prop->name, "nr-gpio"))
+ if (strends(prop->name, ",nr-gpios"))
 return false;

- str = strrchr(prop->name, '-');
- if (str)
- str++;
- else
- str = prop->name;
- if (!(streq(str, "gpios") || streq(str, "gpio")))
- return false;
-
- return true;
+ return strends(prop->name, "-gpios") ||
+ streq(prop->name, "gpios") ||
+ strends(prop->name, "-gpio") ||
+ streq(prop->name, "gpio");
}

static void check_gpios_property(struct check *c,
@@ -1525,13 +1540,10 @@ static void check_deprecated_gpio_proper
struct property *prop;

for_each_property(node, prop) {
- char *str;
-
if (!prop_is_gpio(prop))
continue;

- str = strstr(prop->name, "gpio");
- if (!streq(str, "gpio"))
+ if (!strends(prop->name, "gpio"))
continue;

FAIL_PROP(c, dti, node, prop,
@@ -1561,21 +1573,106 @@ static void check_interrupt_provider(str
struct node *node)
{
struct property *prop;
+ bool irq_provider = node_is_interrupt_provider(node);

- if (!node_is_interrupt_provider(node))
+ prop = get_property(node, "#interrupt-cells");
+ if (irq_provider && !prop) {
+ FAIL(c, dti, node,
+ "Missing '#interrupt-cells' in interrupt provider");
 return;
+ }

- prop = get_property(node, "#interrupt-cells");
- if (!prop)
+ if (!irq_provider && prop) {
 FAIL(c, dti, node,
- "Missing #interrupt-cells in interrupt provider");
+ "'#interrupt-cells' found, but node is not an interrupt provider");
+ return;
+ }
+}
+WARNING(interrupt_provider, check_interrupt_provider, NULL, &interrupts_extended_is_cell);

- prop = get_property(node, "#address-cells");
- if (!prop)
+static void check_interrupt_map(struct check *c,
+ struct dt_info *dti,
+ struct node *node)
+{
+ struct node *root = dti->dt;
+ struct property *prop, *irq_map_prop;
+ size_t cellsize, cell, map_cells;
+
+ irq_map_prop = get_property(node, "interrupt-map");
+ if (!irq_map_prop)
+ return;
+
+ if (node->addr_cells < 0) {
 FAIL(c, dti, node,
- "Missing #address-cells in interrupt provider");
+ "Missing '#address-cells' in interrupt-map provider");
+ return;
+ }
+ cellsize = node_addr_cells(node);
+ cellsize += propval_cell(get_property(node, "#interrupt-cells"));
+
+ prop = get_property(node, "interrupt-map-mask");
+ if (prop && (prop->val.len != (cellsize * sizeof(cell_t))))
+ FAIL_PROP(c, dti, node, prop,
+ "property size (%d) is invalid, expected %zu",
+ prop->val.len, cellsize * sizeof(cell_t));
+
+ if (!is_multiple_of(irq_map_prop->val.len, sizeof(cell_t))) {
+ FAIL_PROP(c, dti, node, irq_map_prop,
+ "property size (%d) is invalid, expected multiple of %zu",
+ irq_map_prop->val.len, sizeof(cell_t));
+ return;
+ }
+
+ map_cells = irq_map_prop->val.len / sizeof(cell_t);
+ for (cell = 0; cell < map_cells; ) {
+ struct node *provider_node;
+ struct property *cellprop;
+ int phandle;
+ size_t parent_cellsize;
+
+ if ((cell + cellsize) >= map_cells) {
+ FAIL_PROP(c, dti, node, irq_map_prop,
+ "property size (%d) too small, expected > %zu",
+ irq_map_prop->val.len, (cell + cellsize) * sizeof(cell_t));
+ break;
+ }
+ cell += cellsize;
+
+ phandle = propval_cell_n(irq_map_prop, cell);
+ if (!phandle_is_valid(phandle)) {
+ /* Give up if this is an overlay with external references */
+ if (!(dti->dtsflags & DTSF_PLUGIN))
+ FAIL_PROP(c, dti, node, irq_map_prop,
+ "Cell %zu is not a phandle(%d)",
+ cell, phandle);
+ break;
+ }
+
+ provider_node = get_node_by_phandle(root, phandle);
+ if (!provider_node) {
+ FAIL_PROP(c, dti, node, irq_map_prop,
+ "Could not get phandle(%d) node for (cell %zu)",
+ phandle, cell);
+ break;
+ }
+
+ cellprop = get_property(provider_node, "#interrupt-cells");
+ if (cellprop) {
+ parent_cellsize = propval_cell(cellprop);
+ } else {
+ FAIL(c, dti, node, "Missing property '#interrupt-cells' in node %s or bad phandle (referred from interrupt-map[%zu])",
+ provider_node->fullpath, cell);
+ break;
+ }
+
+ cellprop = get_property(provider_node, "#address-cells");
+ if (cellprop)
+ parent_cellsize += propval_cell(cellprop);
+
+ cell += 1 + parent_cellsize;
+ }
}
-WARNING(interrupt_provider, check_interrupt_provider, NULL);
+WARNING(interrupt_map, check_interrupt_map, NULL, &phandle_references, &addr_size_cells, &interrupt_provider);

static void check_interrupts_property(struct check *c,
struct dt_info *dti,
@@ -1584,13 +1681,13 @@ static void check_interrupts_property(st
struct node *root = dti->dt;
struct node *irq_node = NULL, *parent = node;
struct property *irq_prop, *prop = NULL;
- int irq_cells, phandle;
+ cell_t irq_cells, phandle;

irq_prop = get_property(node, "interrupts");
if (!irq_prop)
return;

- if (irq_prop->val.len % sizeof(cell_t))
+ if (!is_multiple_of(irq_prop->val.len, sizeof(cell_t)))
FAIL_PROP(c, dti, node, irq_prop, "size (%d) is invalid, expected multiple of %zu",
irq_prop->val.len, sizeof(cell_t));

@@ -1603,7 +1700,7 @@ static void check_interrupts_property(st
prop = get_property(parent, "interrupt-parent");
if (prop) {
phandle = propval_cell(prop);
- if ((phandle == 0) || (phandle == -1)) {
+ if (!phandle_is_valid(phandle)) {
/* Give up if this is an overlay with
* external references */
if (dti->dtsflags & DTSF_PLUGIN)
@@ -1639,7 +1736,7 @@ static void check_interrupts_property(st
}

irq_cells = propval_cell(prop);
- if (irq_prop->val.len % (irq_cells * sizeof(cell_t))) {
+ if (!is_multiple_of(irq_prop->val.len, irq_cells * sizeof(cell_t))) {
FAIL_PROP(c, dti, node, prop,
"size is (%d), expected multiple of %d",
irq_prop->val.len, (int)(irq_cells * sizeof(cell_t)));
@@ -1750,7 +1847,7 @@ WARNING(graph_port, check_graph_port, NU
static struct node *get_remote_endpoint(struct check *c, struct dt_info *dti,
struct node *endpoint)
{
- int phandle;
+ cell_t phandle;
struct node *node;
struct property *prop;

@@ -1760,7 +1857,7 @@ static struct node *get_remote_endpoint(

phandle = propval_cell(prop);
/* Give up if this is an overlay with external references */
- if (phandle == 0 || phandle == -1)
+ if (!phandle_is_valid(phandle))
return NULL;

node = get_node_by_phandle(dti->dt, phandle);
@@ -1796,7 +1893,7 @@ WARNING(graph_endpoint, check_graph_endp
static struct check *check_table[] = {
&duplicate_node_names, &duplicate_property_names,
&node_name_chars, &node_name_format, &property_name_chars,
- &name_is_string, &name_properties,
+ &name_is_string, &name_properties, &node_name_vs_property_name,

&duplicate_label,

@@ -1804,7 +1901,7 @@ static struct check *check_table[] = {
&phandle_references, &path_references,
&omit_unused_nodes,

- &address_cells_is_cell, &size_cells_is_cell, &interrupt_cells_is_cell,
+ &address_cells_is_cell, &size_cells_is_cell,
&device_type_is_string, &model_is_string, &status_is_string,
&label_is_string,

@@ -1839,26 +1936,43 @@ static struct check *check_table[] = {
&chosen_node_is_root, &chosen_node_bootargs, &chosen_node_stdout_path,

&clocks_property,
+ &clocks_is_cell,
&cooling_device_property,
+ &cooling_device_is_cell,
&dmas_property,
+ &dmas_is_cell,
&hwlocks_property,
+ &hwlocks_is_cell,
&interrupts_extended_property,
+ &interrupts_extended_is_cell,
&io_channels_property,
+ &io_channels_is_cell,
&iommus_property,
+ &iommus_is_cell,
&mboxes_property,
+ &mboxes_is_cell,
&msi_parent_property,
+ &msi_parent_is_cell,
&mux_controls_property,
+ &mux_controls_is_cell,
&phys_property,
+ &phys_is_cell,
&power_domains_property,
+ &power_domains_is_cell,
&pwms_property,
+ &pwms_is_cell,
&resets_property,
+ &resets_is_cell,
&sound_dai_property,
+ &sound_dai_is_cell,
&thermal_sensors_property,
+ &thermal_sensors_is_cell,

&deprecated_gpio_property,
&gpios_property,
&interrupts_property,
&interrupt_provider,
+ &interrupt_map,

&alias_paths,

@@ -1882,7 +1996,7 @@ static void enable_warning_error(struct

static void disable_warning_error(struct check *c, bool warn, bool error)
{
- int i;
+ unsigned int i;

/* Lowering level, also lower it for things this is the prereq
* for */
@@ -1903,7 +2017,7 @@ static void disable_warning_error(struct

void parse_checks_option(bool warn, bool error, const char *arg)
{
- int i;
+ unsigned int i;
const char *name = arg;
bool enable = true;

@@ -1930,7 +2044,7 @@ void parse_checks_option(bool warn, bool

void process_checks(bool force, struct dt_info *dti)
{
- int i;
+ unsigned int i;
int error = 0;

for (i = 0; i < ARRAY_SIZE(check_table); i++) {
--- a/scripts/dtc/dtc-lexer.l
+++ b/scripts/dtc/dtc-lexer.l
@@ -57,7 +57,7 @@ static void PRINTF(1, 2) lexical_error(c
push_input_file(name);
}

-<*>^"#"(line)?[ \t]+[0-9]+[ \t]+{STRING}([ \t]+[0-9]+)? {
+<*>^"#"(line)?[ \t]+[0-9]+[ \t]+{STRING}([ \t]+[0-9]+)* {
char *line, *fnstart, *fnend;
struct data fn;
/* skip text before line # */
--- a/scripts/dtc/dtc.c
+++ b/scripts/dtc/dtc.c
@@ -12,7 +12,7 @@
* Command line options
*/
int quiet; /* Level of quietness */
-int reservenum; /* Number of memory reservation slots */
+unsigned int reservenum;/* Number of memory reservation slots */
int minsize; /* Minimum blob size */
int padsize; /* Additional padding to blob */
int alignsize; /* Additional padding to blob accroding to the alignsize */
@@ -197,7 +197,7 @@ int main(int argc, char *argv[])
depname = optarg;
break;
case 'R':
- reservenum = strtol(optarg, NULL, 0);
+ reservenum = strtoul(optarg, NULL, 0);
break;
case 'S':
minsize = strtol(optarg, NULL, 0);
@@ -359,8 +359,6 @@ int main(int argc, char *argv[])
#endif
} else if (streq(outform, "dtb")) {
dt_to_blob(outf, dti, outversion);
- } else if (streq(outform, "dtbo")) {
- dt_to_blob(outf, dti, outversion);
} else if (streq(outform, "asm")) {
dt_to_asm(outf, dti, outversion);
} else if (streq(outform, "null")) {
--- a/scripts/dtc/dtc.h
+++ b/scripts/dtc/dtc.h
@@ -35,7 +35,7 @@
* Command line options
*/
extern int quiet; /* Level of quietness */
-extern int reservenum; /* Number of memory reservation slots */
+extern unsigned int reservenum; /* Number of memory reservation slots */
extern int minsize; /* Minimum blob size */
extern int padsize; /* Additional padding to blob */
extern int alignsize; /* Additional padding to blob accroding to the alignsize */
@@ -51,6 +51,11 @@ extern int annotate; /* annotate .dts w

typedef uint32_t cell_t;

+static inline bool phandle_is_valid(cell_t phandle)
+{
+ return phandle != 0 && phandle != ~0U;
+}
+
static inline uint16_t dtb_ld16(const void *p)
{
const uint8_t *bp = (const uint8_t *)p;
@@ -86,6 +91,16 @@ static inline uint64_t dtb_ld64(const vo
#define streq(a, b) (strcmp((a), (b)) == 0)
#define strstarts(s, prefix) (strncmp((s), (prefix), strlen(prefix)) == 0)
#define strprefixeq(a, n, b) (strlen(b) == (n) && (memcmp(a, b, n) == 0))
+static inline bool strends(const char *str, const char *suffix)
+{
+ unsigned int len, suffix_len;
+
+ len = strlen(str);
+ suffix_len = strlen(suffix);
+ if (len < suffix_len)
+ return false;
+ return streq(str + len - suffix_len, suffix);
+}

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

@@ -101,6 +116,12 @@ enum markertype {
TYPE_UINT64,
TYPE_STRING,
};
+
+static inline bool is_type_marker(enum markertype type)
+{
+ return type >= TYPE_UINT8;
+}
+
extern const char *markername(enum markertype markertype);

struct marker {
@@ -125,7 +146,22 @@ struct data {
for_each_marker(m) \
if ((m)->type == (t))

-size_t type_marker_length(struct marker *m);
+static inline struct marker *next_type_marker(struct marker *m)
+{
+ for_each_marker(m)
+ if (is_type_marker(m->type))
+ break;
+ return m;
+}
+
+static inline size_t type_marker_length(struct marker *m)
+{
+ struct marker *next = next_type_marker(m->next);
+
+ if (next)
+ return next->offset - m->offset;
+ return 0;
+}

void data_free(struct data d);

--- a/scripts/dtc/flattree.c
+++ b/scripts/dtc/flattree.c
@@ -124,7 +124,8 @@ static void asm_emit_cell(void *e, cell_
{
FILE *f = e;

- fprintf(f, "\t.byte 0x%02x; .byte 0x%02x; .byte 0x%02x; .byte 0x%02x\n",
+ fprintf(f, "\t.byte\t0x%02x\n" "\t.byte\t0x%02x\n"
+ "\t.byte\t0x%02x\n" "\t.byte\t0x%02x\n",
(val >> 24) & 0xff, (val >> 16) & 0xff,
(val >> 8) & 0xff, val & 0xff);
}
@@ -134,9 +135,9 @@ static void asm_emit_string(void *e, con
FILE *f = e;

if (len != 0)
- fprintf(f, "\t.string\t\"%.*s\"\n", len, str);
+ fprintf(f, "\t.asciz\t\"%.*s\"\n", len, str);
else
- fprintf(f, "\t.string\t\"%s\"\n", str);
+ fprintf(f, "\t.asciz\t\"%s\"\n", str);
}

static void asm_emit_align(void *e, int a)
@@ -295,7 +296,7 @@ static struct data flatten_reserve_list(
{
struct reserve_info *re;
struct data d = empty_data;
- int j;
+ unsigned int j;

for (re = reservelist; re; re = re->next) {
d = data_append_re(d, re->address, re->size);
@@ -438,7 +439,7 @@ static void dump_stringtable_asm(FILE *f

while (p < (strbuf.val + strbuf.len)) {
len = strlen(p);
- fprintf(f, "\t.string \"%s\"\n", p);
+ fprintf(f, "\t.asciz \"%s\"\n", p);
p += len+1;
}
}
--- a/scripts/dtc/libfdt/fdt.c
+++ b/scripts/dtc/libfdt/fdt.c
@@ -90,6 +90,10 @@ int fdt_check_header(const void *fdt)
{
size_t hdrsize;

+ /* The device tree must be at an 8-byte aligned address */
+ if ((uintptr_t)fdt & 7)
+ return -FDT_ERR_ALIGNMENT;
+
if (fdt_magic(fdt) != FDT_MAGIC)
return -FDT_ERR_BADMAGIC;
if (!can_assume(LATEST)) {
--- a/scripts/dtc/libfdt/fdt_rw.c
+++ b/scripts/dtc/libfdt/fdt_rw.c
@@ -349,7 +349,10 @@ int fdt_add_subnode_namelen(void *fdt, i
return offset;

/* Try to place the new node after the parent's properties */
- fdt_next_tag(fdt, parentoffset, &nextoffset); /* skip the BEGIN_NODE */
+ tag = fdt_next_tag(fdt, parentoffset, &nextoffset);
+ /* the fdt_subnode_offset_namelen() should ensure this never hits */
+ if (!can_assume(LIBFDT_FLAWLESS) && (tag != FDT_BEGIN_NODE))
+ return -FDT_ERR_INTERNAL;
do {
offset = nextoffset;
tag = fdt_next_tag(fdt, offset, &nextoffset);
@@ -391,7 +394,9 @@ int fdt_del_node(void *fdt, int nodeoffs
}

static void fdt_packblocks_(const char *old, char *new,
- int mem_rsv_size, int struct_size)
+ int mem_rsv_size,
+ int struct_size,
+ int strings_size)
{
int mem_rsv_off, struct_off, strings_off;

@@ -406,8 +411,7 @@ static void fdt_packblocks_(const char *
fdt_set_off_dt_struct(new, struct_off);
fdt_set_size_dt_struct(new, struct_size);

- memmove(new + strings_off, old + fdt_off_dt_strings(old),
- fdt_size_dt_strings(old));
+ memmove(new + strings_off, old + fdt_off_dt_strings(old), strings_size);
fdt_set_off_dt_strings(new, strings_off);
fdt_set_size_dt_strings(new, fdt_size_dt_strings(old));
}
@@ -467,7 +471,8 @@ int fdt_open_into(const void *fdt, void
return -FDT_ERR_NOSPACE;
}

- fdt_packblocks_(fdt, tmp, mem_rsv_size, struct_size);
+ fdt_packblocks_(fdt, tmp, mem_rsv_size, struct_size,
+ fdt_size_dt_strings(fdt));
memmove(buf, tmp, newsize);

fdt_set_magic(buf, FDT_MAGIC);
@@ -487,7 +492,8 @@ int fdt_pack(void *fdt)

mem_rsv_size = (fdt_num_mem_rsv(fdt)+1)
* sizeof(struct fdt_reserve_entry);
- fdt_packblocks_(fdt, fdt, mem_rsv_size, fdt_size_dt_struct(fdt));
+ fdt_packblocks_(fdt, fdt, mem_rsv_size, fdt_size_dt_struct(fdt),
+ fdt_size_dt_strings(fdt));
fdt_set_totalsize(fdt, fdt_data_size_(fdt));

return 0;
--- a/scripts/dtc/libfdt/fdt_strerror.c
+++ b/scripts/dtc/libfdt/fdt_strerror.c
@@ -39,6 +39,7 @@ static struct fdt_errtabent fdt_errtable
FDT_ERRTABENT(FDT_ERR_BADOVERLAY),
FDT_ERRTABENT(FDT_ERR_NOPHANDLES),
FDT_ERRTABENT(FDT_ERR_BADFLAGS),
+ FDT_ERRTABENT(FDT_ERR_ALIGNMENT),
};
#define FDT_ERRTABSIZE ((int)(sizeof(fdt_errtable) / sizeof(fdt_errtable[0])))

--- a/scripts/dtc/libfdt/libfdt.h
+++ b/scripts/dtc/libfdt/libfdt.h
@@ -131,6 +131,13 @@ uint32_t fdt_next_tag(const void *fdt, i
* to work even with unaligned pointers on platforms (such as ARMv5) that don't
* like unaligned loads and stores.
*/
+static inline uint16_t fdt16_ld(const fdt16_t *p)
+{
+ const uint8_t *bp = (const uint8_t *)p;
+
+ return ((uint16_t)bp[0] << 8) | bp[1];
+}
+
static inline uint32_t fdt32_ld(const fdt32_t *p)
{
const uint8_t *bp = (const uint8_t *)p;
--- a/scripts/dtc/livetree.c
+++ b/scripts/dtc/livetree.c
@@ -526,7 +526,7 @@ struct node *get_node_by_path(struct nod
p = strchr(path, '/');

for_each_child(tree, child) {
- if (p && strprefixeq(path, p - path, child->name))
+ if (p && strprefixeq(path, (size_t)(p - path), child->name))
return get_node_by_path(child, p+1);
else if (!p && streq(path, child->name))
return child;
@@ -559,7 +559,7 @@ struct node *get_node_by_phandle(struct
{
struct node *child, *node;

- if ((phandle == 0) || (phandle == -1)) {
+ if (!phandle_is_valid(phandle)) {
assert(generate_fixups);
return NULL;
}
@@ -594,7 +594,7 @@ cell_t get_node_phandle(struct node *roo
static cell_t phandle = 1; /* FIXME: ick, static local */
struct data d = empty_data;

- if ((node->phandle != 0) && (node->phandle != -1))
+ if (phandle_is_valid(node->phandle))
return node->phandle;

while (get_node_by_phandle(root, phandle))
--- a/scripts/dtc/treesource.c
+++ b/scripts/dtc/treesource.c
@@ -124,27 +124,6 @@ static void write_propval_int(FILE *f, c
}
}

-static bool has_data_type_information(struct marker *m)
-{
- return m->type >= TYPE_UINT8;
-}
-
-static struct marker *next_type_marker(struct marker *m)
-{
- while (m && !has_data_type_information(m))
- m = m->next;
- return m;
-}
-
-size_t type_marker_length(struct marker *m)
-{
- struct marker *next = next_type_marker(m->next);
-
- if (next)
- return next->offset - m->offset;
- return 0;
-}
-
static const char *delim_start[] = {
[TYPE_UINT8] = "[",
[TYPE_UINT16] = "/bits/ 16 <",
@@ -229,26 +208,39 @@ static void write_propval(FILE *f, struc
size_t chunk_len = (m->next ? m->next->offset : len) - m->offset;
|
||||
size_t data_len = type_marker_length(m) ? : len - m->offset;
|
||||
const char *p = &prop->val.val[m->offset];
|
||||
+ struct marker *m_phandle;
|
||||
|
||||
- if (has_data_type_information(m)) {
|
||||
+ if (is_type_marker(m->type)) {
|
||||
emit_type = m->type;
|
||||
fprintf(f, " %s", delim_start[emit_type]);
|
||||
} else if (m->type == LABEL)
|
||||
fprintf(f, " %s:", m->ref);
|
||||
- else if (m->offset)
|
||||
- fputc(' ', f);
|
||||
|
||||
- if (emit_type == TYPE_NONE) {
|
||||
- assert(chunk_len == 0);
|
||||
+ if (emit_type == TYPE_NONE || chunk_len == 0)
|
||||
continue;
|
||||
- }
|
||||
|
||||
switch(emit_type) {
|
||||
case TYPE_UINT16:
|
||||
write_propval_int(f, p, chunk_len, 2);
|
||||
break;
|
||||
case TYPE_UINT32:
|
||||
- write_propval_int(f, p, chunk_len, 4);
|
||||
+ m_phandle = prop->val.markers;
|
||||
+ for_each_marker_of_type(m_phandle, REF_PHANDLE)
|
||||
+ if (m->offset == m_phandle->offset)
|
||||
+ break;
|
||||
+
|
||||
+ if (m_phandle) {
|
||||
+ if (m_phandle->ref[0] == '/')
|
||||
+ fprintf(f, "&{%s}", m_phandle->ref);
|
||||
+ else
|
||||
+ fprintf(f, "&%s", m_phandle->ref);
|
||||
+ if (chunk_len > 4) {
|
||||
+ fputc(' ', f);
|
||||
+ write_propval_int(f, p + 4, chunk_len - 4, 4);
|
||||
+ }
|
||||
+ } else {
|
||||
+ write_propval_int(f, p, chunk_len, 4);
|
||||
+ }
|
||||
break;
|
||||
case TYPE_UINT64:
|
||||
write_propval_int(f, p, chunk_len, 8);
|
||||
--- a/scripts/dtc/util.h
|
||||
+++ b/scripts/dtc/util.h
|
||||
@@ -13,10 +13,10 @@
|
||||
*/
|
||||
|
||||
#ifdef __GNUC__
|
||||
-#ifdef __clang__
|
||||
-#define PRINTF(i, j) __attribute__((format (printf, i, j)))
|
||||
-#else
|
||||
+#if __GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
|
||||
#define PRINTF(i, j) __attribute__((format (gnu_printf, i, j)))
|
||||
+#else
|
||||
+#define PRINTF(i, j) __attribute__((format (printf, i, j)))
|
||||
#endif
|
||||
#define NORETURN __attribute__((noreturn))
|
||||
#else
|
||||
--- a/scripts/dtc/version_gen.h
|
||||
+++ b/scripts/dtc/version_gen.h
|
||||
@@ -1 +1 @@
|
||||
-#define DTC_VERSION "DTC 1.6.0-g183df9e9"
|
||||
+#define DTC_VERSION "DTC 1.6.1-g0a3a9d34"
|
||||
--- a/scripts/dtc/yamltree.c
|
||||
+++ b/scripts/dtc/yamltree.c
|
||||
@@ -29,11 +29,12 @@ char *yaml_error_name[] = {
|
||||
(emitter)->problem, __func__, __LINE__); \
|
||||
})
|
||||
|
||||
-static void yaml_propval_int(yaml_emitter_t *emitter, struct marker *markers, char *data, unsigned int len, int width)
|
||||
+static void yaml_propval_int(yaml_emitter_t *emitter, struct marker *markers,
|
||||
+ char *data, unsigned int seq_offset, unsigned int len, int width)
|
||||
{
|
||||
yaml_event_t event;
|
||||
void *tag;
|
||||
- unsigned int off, start_offset = markers->offset;
|
||||
+ unsigned int off;
|
||||
|
||||
switch(width) {
|
||||
case 1: tag = "!u8"; break;
|
||||
@@ -66,7 +67,7 @@ static void yaml_propval_int(yaml_emitte
|
||||
m = markers;
|
||||
is_phandle = false;
|
||||
for_each_marker_of_type(m, REF_PHANDLE) {
|
||||
- if (m->offset == (start_offset + off)) {
|
||||
+ if (m->offset == (seq_offset + off)) {
|
||||
is_phandle = true;
|
||||
break;
|
||||
}
|
||||
@@ -114,6 +115,7 @@ static void yaml_propval(yaml_emitter_t
|
||||
yaml_event_t event;
|
||||
unsigned int len = prop->val.len;
|
||||
struct marker *m = prop->val.markers;
|
||||
+ struct marker *markers = prop->val.markers;
|
||||
|
||||
/* Emit the property name */
|
||||
yaml_scalar_event_initialize(&event, NULL,
|
||||
@@ -151,19 +153,19 @@ static void yaml_propval(yaml_emitter_t
|
||||
|
||||
switch(m->type) {
|
||||
case TYPE_UINT16:
|
||||
- yaml_propval_int(emitter, m, data, chunk_len, 2);
|
||||
+ yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 2);
|
||||
break;
|
||||
case TYPE_UINT32:
|
||||
- yaml_propval_int(emitter, m, data, chunk_len, 4);
|
||||
+ yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 4);
|
||||
break;
|
||||
case TYPE_UINT64:
|
||||
- yaml_propval_int(emitter, m, data, chunk_len, 8);
|
||||
+ yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 8);
|
||||
break;
|
||||
case TYPE_STRING:
|
||||
yaml_propval_string(emitter, data, chunk_len);
|
||||
break;
|
||||
default:
|
||||
- yaml_propval_int(emitter, m, data, chunk_len, 1);
|
||||
+ yaml_propval_int(emitter, markers, data, m->offset, chunk_len, 1);
|
||||
break;
|
||||
}
|
||||
}
|
@ -0,0 +1,48 @@
From: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
To: linus.walleij@linaro.org
Cc: bjorn.andersson@linaro.org, dianders@chromium.org,
linux-arm-msm@vger.kernel.org, linux-gpio@vger.kernel.org,
linux-kernel@vger.kernel.org,
Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
Subject: [PATCH] pinctrl: qcom: Return -EINVAL for setting affinity if no IRQ
parent
Date: Thu, 13 Jan 2022 21:56:17 +0530
Message-Id: <20220113162617.131697-1-manivannan.sadhasivam@linaro.org>

The MSM GPIO IRQ controller relies on the parent IRQ controller to set the
CPU affinity for the IRQ. And this is only valid if there is any wakeup
parent available and defined in DT.

For the case of no parent IRQ controller defined in DT,
msm_gpio_irq_set_affinity() and msm_gpio_irq_set_vcpu_affinity() should
return -EINVAL instead of 0 as the affinity can't be set.

Otherwise, below warning will be printed by genirq:

genirq: irq_chip msmgpio did not update eff. affinity mask of irq 70

Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
---
drivers/pinctrl/qcom/pinctrl-msm.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

--- a/drivers/pinctrl/qcom/pinctrl-msm.c
+++ b/drivers/pinctrl/qcom/pinctrl-msm.c
@@ -1157,7 +1157,7 @@ static int msm_gpio_irq_set_affinity(str
if (d->parent_data && test_bit(d->hwirq, pctrl->skip_wake_irqs))
return irq_chip_set_affinity_parent(d, dest, force);

- return 0;
+ return -EINVAL;
}

static int msm_gpio_irq_set_vcpu_affinity(struct irq_data *d, void *vcpu_info)
@@ -1168,7 +1168,7 @@ static int msm_gpio_irq_set_vcpu_affinit
if (d->parent_data && test_bit(d->hwirq, pctrl->skip_wake_irqs))
return irq_chip_set_vcpu_affinity_parent(d, vcpu_info);

- return 0;
+ return -EINVAL;
}

static void msm_gpio_irq_handler(struct irq_desc *desc)
@ -0,0 +1,166 @@
From b5af64fceb04dc298c5e69c517b4d83893ff060b Mon Sep 17 00:00:00 2001
From: Bjorn Andersson <bjorn.andersson@linaro.org>
Date: Thu, 30 Sep 2021 11:21:10 -0700
Subject: [PATCH 1/1] soc: qcom: smem: Support reserved-memory description

Practically all modern Qualcomm platforms have a single reserved-memory
region for SMEM. So rather than having to describe SMEM in the form of a
node with a reference to a reserved-memory node, allow the SMEM device
to be instantiated directly from the reserved-memory node.

The current means of falling back to dereferencing the "memory-region"
is kept as a fallback, if it's determined that the SMEM node is a
reserved-memory node.

The "qcom,smem" compatible is added to the reserved_mem_matches list, to
allow the reserved-memory device to be probed.

In order to retain the readability of the code, the resolution of
resources is split from the actual ioremapping.

Signed-off-by: Bjorn Andersson <bjorn.andersson@linaro.org>
Acked-by: Rob Herring <robh@kernel.org>
Reviewed-by: Vladimir Zapolskiy <vladimir.zapolskiy@linaro.org>
Link: https://lore.kernel.org/r/20210930182111.57353-4-bjorn.andersson@linaro.org
---
drivers/of/platform.c | 1 +
drivers/soc/qcom/smem.c | 57 ++++++++++++++++++++++++++++-------------
2 files changed, 40 insertions(+), 18 deletions(-)

--- a/drivers/of/platform.c
+++ b/drivers/of/platform.c
@@ -509,6 +509,7 @@ EXPORT_SYMBOL_GPL(of_platform_default_po
static const struct of_device_id reserved_mem_matches[] = {
{ .compatible = "qcom,rmtfs-mem" },
{ .compatible = "qcom,cmd-db" },
+ { .compatible = "qcom,smem" },
{ .compatible = "ramoops" },
{ .compatible = "nvmem-rmem" },
{}
--- a/drivers/soc/qcom/smem.c
+++ b/drivers/soc/qcom/smem.c
@@ -9,6 +9,7 @@
#include <linux/module.h>
#include <linux/of.h>
#include <linux/of_address.h>
+#include <linux/of_reserved_mem.h>
#include <linux/platform_device.h>
#include <linux/sizes.h>
#include <linux/slab.h>
@@ -240,7 +241,7 @@ static const u8 SMEM_INFO_MAGIC[] = { 0x
* @size: size of the memory region
*/
struct smem_region {
- u32 aux_base;
+ phys_addr_t aux_base;
void __iomem *virt_base;
size_t size;
};
@@ -499,7 +500,7 @@ static void *qcom_smem_get_global(struct
for (i = 0; i < smem->num_regions; i++) {
region = &smem->regions[i];

- if (region->aux_base == aux_base || !aux_base) {
+ if ((u32)region->aux_base == aux_base || !aux_base) {
if (size != NULL)
*size = le32_to_cpu(entry->size);
return region->virt_base + le32_to_cpu(entry->offset);
@@ -664,7 +665,7 @@ phys_addr_t qcom_smem_virt_to_phys(void
if (p < region->virt_base + region->size) {
u64 offset = p - region->virt_base;

- return (phys_addr_t)region->aux_base + offset;
+ return region->aux_base + offset;
}
}

@@ -863,12 +864,12 @@ qcom_smem_enumerate_partitions(struct qc
return 0;
}

-static int qcom_smem_map_memory(struct qcom_smem *smem, struct device *dev,
- const char *name, int i)
+static int qcom_smem_resolve_mem(struct qcom_smem *smem, const char *name,
+ struct smem_region *region)
{
+ struct device *dev = smem->dev;
struct device_node *np;
struct resource r;
- resource_size_t size;
int ret;

np = of_parse_phandle(dev->of_node, name, 0);
@@ -881,13 +882,9 @@ static int qcom_smem_map_memory(struct q
of_node_put(np);
if (ret)
return ret;
- size = resource_size(&r);

- smem->regions[i].virt_base = devm_ioremap_wc(dev, r.start, size);
- if (!smem->regions[i].virt_base)
- return -ENOMEM;
- smem->regions[i].aux_base = (u32)r.start;
- smem->regions[i].size = size;
+ region->aux_base = r.start;
+ region->size = resource_size(&r);

return 0;
}
@@ -895,12 +892,14 @@ static int qcom_smem_map_memory(struct q
static int qcom_smem_probe(struct platform_device *pdev)
{
struct smem_header *header;
+ struct reserved_mem *rmem;
struct qcom_smem *smem;
size_t array_size;
int num_regions;
int hwlock_id;
u32 version;
int ret;
+ int i;

num_regions = 1;
if (of_find_property(pdev->dev.of_node, "qcom,rpm-msg-ram", NULL))
@@ -914,13 +913,35 @@ static int qcom_smem_probe(struct platfo
smem->dev = &pdev->dev;
smem->num_regions = num_regions;

- ret = qcom_smem_map_memory(smem, &pdev->dev, "memory-region", 0);
- if (ret)
- return ret;
-
- if (num_regions > 1 && (ret = qcom_smem_map_memory(smem, &pdev->dev,
- "qcom,rpm-msg-ram", 1)))
- return ret;
+ rmem = of_reserved_mem_lookup(pdev->dev.of_node);
+ if (rmem) {
+ smem->regions[0].aux_base = rmem->base;
+ smem->regions[0].size = rmem->size;
+ } else {
+ /*
+ * Fall back to the memory-region reference, if we're not a
+ * reserved-memory node.
+ */
+ ret = qcom_smem_resolve_mem(smem, "memory-region", &smem->regions[0]);
+ if (ret)
+ return ret;
+ }
+
+ if (num_regions > 1) {
+ ret = qcom_smem_resolve_mem(smem, "qcom,rpm-msg-ram", &smem->regions[1]);
+ if (ret)
+ return ret;
+ }
+
+ for (i = 0; i < num_regions; i++) {
+ smem->regions[i].virt_base = devm_ioremap_wc(&pdev->dev,
+ smem->regions[i].aux_base,
+ smem->regions[i].size);
+ if (!smem->regions[i].virt_base) {
+ dev_err(&pdev->dev, "failed to remap %pa\n", &smem->regions[i].aux_base);
+ return -ENOMEM;
+ }
+ }

header = smem->regions[0].virt_base;
if (le32_to_cpu(header->initialized) != 1 ||
@ -0,0 +1,33 @@
From ee1a0696934a8b77a6a2098f92832c46d34ec5da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Wed, 27 Oct 2021 14:31:35 +0200
Subject: [PATCH] watchdog: bcm63xx_wdt: fix fallthrough warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes:
drivers/watchdog/bcm63xx_wdt.c: In function 'bcm63xx_wdt_ioctl':
drivers/watchdog/bcm63xx_wdt.c:208:17: warning: this statement may fall through [-Wimplicit-fallthrough=]

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Guenter Roeck <linux@roeck-us.net>
Link: https://lore.kernel.org/r/20211027123135.27458-1-zajec5@gmail.com
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
---
drivers/watchdog/bcm63xx_wdt.c | 2 ++
1 file changed, 2 insertions(+)

--- a/drivers/watchdog/bcm63xx_wdt.c
+++ b/drivers/watchdog/bcm63xx_wdt.c
@@ -207,6 +207,8 @@ static long bcm63xx_wdt_ioctl(struct fil

bcm63xx_wdt_pet();

+ fallthrough;
+
case WDIOC_GETTIMEOUT:
return put_user(wdt_time, p);

@ -0,0 +1,162 @@
From 626bfa03729959ea9917181fb3d8ffaa1594d02a Mon Sep 17 00:00:00 2001
From: Hauke Mehrtens <hauke@hauke-m.de>
Date: Wed, 13 Oct 2021 22:40:18 -0700
Subject: [PATCH 1/1] MIPS: kernel: proc: add CPU option reporting

Many MIPS CPUs have optional CPU features which are not activated for
all CPU cores. Print the CPU options, which are implemented in the core,
in /proc/cpuinfo. This makes it possible to see which features are
supported and which are not supported. This should cover all standard
MIPS extensions. Before, it only printed information about the main MIPS
ASEs.

Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>

Changes from original patch[0]:
- Remove cpu_has_6k_cache and cpu_has_8k_cache due to commit 6ce91ba8589a
("MIPS: Remove cpu_has_6k_cache and cpu_has_8k_cache in cpu_cache_init()")
- Add new options: mac2008_only, ftlbparex, gsexcex, mmid, mm_sysad,
mm_full
- Use seq_puts instead of seq_printf as suggested by checkpatch
- Minor commit message reword

[0]: https://lore.kernel.org/linux-mips/20181223225224.23042-1-hauke@hauke-m.de/

Signed-off-by: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Acked-by: Hauke Mehrtens <hauke@hauke-m.de>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
arch/mips/kernel/proc.c | 122 ++++++++++++++++++++++++++++++++++
1 file changed, 122 insertions(+)

--- a/arch/mips/kernel/proc.c
+++ b/arch/mips/kernel/proc.c
@@ -138,6 +138,128 @@ static int show_cpuinfo(struct seq_file
seq_printf(m, "micromips kernel\t: %s\n",
(read_c0_config3() & MIPS_CONF3_ISA_OE) ? "yes" : "no");
}
+
+ seq_puts(m, "Options implemented\t:");
+ if (cpu_has_tlb)
+ seq_puts(m, " tlb");
+ if (cpu_has_ftlb)
+ seq_puts(m, " ftlb");
+ if (cpu_has_tlbinv)
+ seq_puts(m, " tlbinv");
+ if (cpu_has_segments)
+ seq_puts(m, " segments");
+ if (cpu_has_rixiex)
+ seq_puts(m, " rixiex");
+ if (cpu_has_ldpte)
+ seq_puts(m, " ldpte");
+ if (cpu_has_maar)
+ seq_puts(m, " maar");
+ if (cpu_has_rw_llb)
+ seq_puts(m, " rw_llb");
+ if (cpu_has_4kex)
+ seq_puts(m, " 4kex");
+ if (cpu_has_3k_cache)
+ seq_puts(m, " 3k_cache");
+ if (cpu_has_4k_cache)
+ seq_puts(m, " 4k_cache");
+ if (cpu_has_tx39_cache)
+ seq_puts(m, " tx39_cache");
+ if (cpu_has_octeon_cache)
+ seq_puts(m, " octeon_cache");
+ if (cpu_has_fpu)
+ seq_puts(m, " fpu");
+ if (cpu_has_32fpr)
+ seq_puts(m, " 32fpr");
+ if (cpu_has_cache_cdex_p)
+ seq_puts(m, " cache_cdex_p");
+ if (cpu_has_cache_cdex_s)
+ seq_puts(m, " cache_cdex_s");
+ if (cpu_has_prefetch)
+ seq_puts(m, " prefetch");
+ if (cpu_has_mcheck)
+ seq_puts(m, " mcheck");
+ if (cpu_has_ejtag)
+ seq_puts(m, " ejtag");
+ if (cpu_has_llsc)
+ seq_puts(m, " llsc");
+ if (cpu_has_guestctl0ext)
+ seq_puts(m, " guestctl0ext");
+ if (cpu_has_guestctl1)
+ seq_puts(m, " guestctl1");
+ if (cpu_has_guestctl2)
+ seq_puts(m, " guestctl2");
+ if (cpu_has_guestid)
+ seq_puts(m, " guestid");
+ if (cpu_has_drg)
+ seq_puts(m, " drg");
+ if (cpu_has_rixi)
+ seq_puts(m, " rixi");
+ if (cpu_has_lpa)
+ seq_puts(m, " lpa");
+ if (cpu_has_mvh)
+ seq_puts(m, " mvh");
+ if (cpu_has_vtag_icache)
+ seq_puts(m, " vtag_icache");
+ if (cpu_has_dc_aliases)
+ seq_puts(m, " dc_aliases");
+ if (cpu_has_ic_fills_f_dc)
+ seq_puts(m, " ic_fills_f_dc");
+ if (cpu_has_pindexed_dcache)
+ seq_puts(m, " pindexed_dcache");
+ if (cpu_has_userlocal)
+ seq_puts(m, " userlocal");
+ if (cpu_has_nofpuex)
+ seq_puts(m, " nofpuex");
+ if (cpu_has_vint)
+ seq_puts(m, " vint");
+ if (cpu_has_veic)
+ seq_puts(m, " veic");
+ if (cpu_has_inclusive_pcaches)
+ seq_puts(m, " inclusive_pcaches");
+ if (cpu_has_perf_cntr_intr_bit)
+ seq_puts(m, " perf_cntr_intr_bit");
+ if (cpu_has_ufr)
+ seq_puts(m, " ufr");
+ if (cpu_has_fre)
+ seq_puts(m, " fre");
+ if (cpu_has_cdmm)
+ seq_puts(m, " cdmm");
+ if (cpu_has_small_pages)
+ seq_puts(m, " small_pages");
+ if (cpu_has_nan_legacy)
+ seq_puts(m, " nan_legacy");
+ if (cpu_has_nan_2008)
+ seq_puts(m, " nan_2008");
+ if (cpu_has_ebase_wg)
+ seq_puts(m, " ebase_wg");
+ if (cpu_has_badinstr)
+ seq_puts(m, " badinstr");
+ if (cpu_has_badinstrp)
+ seq_puts(m, " badinstrp");
+ if (cpu_has_contextconfig)
+ seq_puts(m, " contextconfig");
+ if (cpu_has_perf)
+ seq_puts(m, " perf");
+ if (cpu_has_mac2008_only)
+ seq_puts(m, " mac2008_only");
+ if (cpu_has_ftlbparex)
+ seq_puts(m, " ftlbparex");
+ if (cpu_has_gsexcex)
+ seq_puts(m, " gsexcex");
+ if (cpu_has_shared_ftlb_ram)
+ seq_puts(m, " shared_ftlb_ram");
+ if (cpu_has_shared_ftlb_entries)
+ seq_puts(m, " shared_ftlb_entries");
+ if (cpu_has_mipsmt_pertccounters)
+ seq_puts(m, " mipsmt_pertccounters");
+ if (cpu_has_mmid)
+ seq_puts(m, " mmid");
+ if (cpu_has_mm_sysad)
+ seq_puts(m, " mm_sysad");
+ if (cpu_has_mm_full)
+ seq_puts(m, " mm_full");
+ seq_puts(m, "\n");
+
seq_printf(m, "shadow register sets\t: %d\n",
cpu_data[n].srsets);
seq_printf(m, "kscratch registers\t: %d\n",
@ -0,0 +1,62 @@
From 1cab5bd69eb1f995ced2d7576cb15f8a8941fd85 Mon Sep 17 00:00:00 2001
From: Tiezhu Yang <yangtiezhu@loongson.cn>
Date: Thu, 25 Nov 2021 19:39:32 +0800
Subject: [PATCH 1/1] MIPS: Fix using smp_processor_id() in preemptible in
show_cpuinfo()

There exists the following issue under DEBUG_PREEMPT:

BUG: using smp_processor_id() in preemptible [00000000] code: systemd/1
caller is show_cpuinfo+0x460/0xea0
...
Call Trace:
[<ffffffff8020f0dc>] show_stack+0x94/0x128
[<ffffffff80e6cab4>] dump_stack_lvl+0x94/0xd8
[<ffffffff80e74c5c>] check_preemption_disabled+0x104/0x110
[<ffffffff802209c8>] show_cpuinfo+0x460/0xea0
[<ffffffff80539d54>] seq_read_iter+0xfc/0x4f8
[<ffffffff804fcc10>] new_sync_read+0x110/0x1b8
[<ffffffff804ff57c>] vfs_read+0x1b4/0x1d0
[<ffffffff804ffb18>] ksys_read+0xd0/0x110
[<ffffffff8021c090>] syscall_common+0x34/0x58

We can see the following call trace:
show_cpuinfo()
cpu_has_fpu
current_cpu_data
smp_processor_id()

$ addr2line -f -e vmlinux 0xffffffff802209c8
show_cpuinfo
arch/mips/kernel/proc.c:188

$ head -188 arch/mips/kernel/proc.c | tail -1
if (cpu_has_fpu)

arch/mips/include/asm/cpu-features.h
# define cpu_has_fpu (current_cpu_data.options & MIPS_CPU_FPU)

arch/mips/include/asm/cpu-info.h
#define current_cpu_data cpu_data[smp_processor_id()]

Based on the above analysis, fix the issue by using raw_cpu_has_fpu
which calls raw_smp_processor_id() in show_cpuinfo().

Fixes: 626bfa037299 ("MIPS: kernel: proc: add CPU option reporting")
Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
Signed-off-by: Thomas Bogendoerfer <tsbogend@alpha.franken.de>
---
arch/mips/kernel/proc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- a/arch/mips/kernel/proc.c
+++ b/arch/mips/kernel/proc.c
@@ -166,7 +166,7 @@ static int show_cpuinfo(struct seq_file
seq_puts(m, " tx39_cache");
if (cpu_has_octeon_cache)
seq_puts(m, " octeon_cache");
- if (cpu_has_fpu)
+ if (raw_cpu_has_fpu)
seq_puts(m, " fpu");
if (cpu_has_32fpr)
seq_puts(m, " 32fpr");
@ -0,0 +1,186 @@
From f4c5c7f9d2e5ab005d57826b740b694b042a737c Mon Sep 17 00:00:00 2001
From: Felix Matouschek <felix@matouschek.org>
Date: Mon, 18 Apr 2022 15:28:03 +0200
Subject: [PATCH 1/1] mtd: spinand: Add support for XTX XT26G0xA

Add support for XTX Technology XT26G01AXXXXX, XT26G02AXXXXX and
XT26G04AXXXXX SPI NAND.

These are 3V, 1G/2G/4Gbit serial SLC NAND flash devices with on-die ECC
(8bit strength per 512bytes).

Tested on Teltonika RUTX10 flashed with OpenWrt.

Links:
- http://www.xtxtech.com/download/?AId=225
- https://datasheet.lcsc.com/szlcsc/2005251034_XTX-XT26G01AWSEGA_C558841.pdf
Signed-off-by: Felix Matouschek <felix@matouschek.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220418132803.664103-1-felix@matouschek.org
---
drivers/mtd/nand/spi/Makefile | 2 +-
drivers/mtd/nand/spi/core.c | 1 +
drivers/mtd/nand/spi/xtx.c | 129 ++++++++++++++++++++++++++++++++++
include/linux/mtd/spinand.h | 1 +
4 files changed, 132 insertions(+), 1 deletion(-)
create mode 100644 drivers/mtd/nand/spi/xtx.c

--- a/drivers/mtd/nand/spi/Makefile
+++ b/drivers/mtd/nand/spi/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0
-spinand-objs := core.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o
+spinand-objs := core.o gigadevice.o macronix.o micron.o paragon.o toshiba.o winbond.o xtx.o
obj-$(CONFIG_MTD_SPI_NAND) += spinand.o
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -902,6 +902,7 @@ static const struct spinand_manufacturer
&paragon_spinand_manufacturer,
&toshiba_spinand_manufacturer,
&winbond_spinand_manufacturer,
+ &xtx_spinand_manufacturer,
};

static int spinand_manufacturer_match(struct spinand_device *spinand,
--- /dev/null
+++ b/drivers/mtd/nand/spi/xtx.c
@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Author:
+ * Felix Matouschek <felix@matouschek.org>
+ */
+
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/mtd/spinand.h>
+
+#define SPINAND_MFR_XTX 0x0B
+
+#define XT26G0XA_STATUS_ECC_MASK GENMASK(5, 2)
+#define XT26G0XA_STATUS_ECC_NO_DETECTED (0 << 2)
+#define XT26G0XA_STATUS_ECC_8_CORRECTED (3 << 4)
+#define XT26G0XA_STATUS_ECC_UNCOR_ERROR (2 << 4)
+
+static SPINAND_OP_VARIANTS(read_cache_variants,
+ SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(0, 1, NULL, 0),
+ SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+ SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(0, 1, NULL, 0),
+ SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+ SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+ SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
+static SPINAND_OP_VARIANTS(write_cache_variants,
+ SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
+ SPINAND_PROG_LOAD(true, 0, NULL, 0));
+
+static SPINAND_OP_VARIANTS(update_cache_variants,
+ SPINAND_PROG_LOAD_X4(false, 0, NULL, 0),
+ SPINAND_PROG_LOAD(false, 0, NULL, 0));
+
+static int xt26g0xa_ooblayout_ecc(struct mtd_info *mtd, int section,
+ struct mtd_oob_region *region)
+{
+ if (section)
+ return -ERANGE;
+
+ region->offset = 48;
+ region->length = 16;
+
+ return 0;
+}
+
+static int xt26g0xa_ooblayout_free(struct mtd_info *mtd, int section,
+ struct mtd_oob_region *region)
+{
+ if (section)
+ return -ERANGE;
+
+ region->offset = 1;
+ region->length = 47;
+
+ return 0;
+}
+
+static const struct mtd_ooblayout_ops xt26g0xa_ooblayout = {
+ .ecc = xt26g0xa_ooblayout_ecc,
+ .free = xt26g0xa_ooblayout_free,
+};
+
+static int xt26g0xa_ecc_get_status(struct spinand_device *spinand,
+ u8 status)
+{
+ status = status & XT26G0XA_STATUS_ECC_MASK;
+
+ switch (status) {
+ case XT26G0XA_STATUS_ECC_NO_DETECTED:
+ return 0;
+ case XT26G0XA_STATUS_ECC_8_CORRECTED:
+ return 8;
+ case XT26G0XA_STATUS_ECC_UNCOR_ERROR:
+ return -EBADMSG;
+ default:
+ break;
+ }
+
+ /* At this point values greater than (2 << 4) are invalid */
+ if (status > XT26G0XA_STATUS_ECC_UNCOR_ERROR)
+ return -EINVAL;
+
+ /* (1 << 2) through (7 << 2) are 1-7 corrected errors */
+ return status >> 2;
+}
+
+static const struct spinand_info xtx_spinand_table[] = {
+ SPINAND_INFO("XT26G01A",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xE1),
+ NAND_MEMORG(1, 2048, 64, 64, 1024, 20, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&xt26g0xa_ooblayout,
+ xt26g0xa_ecc_get_status)),
+ SPINAND_INFO("XT26G02A",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xE2),
+ NAND_MEMORG(1, 2048, 64, 64, 2048, 40, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&xt26g0xa_ooblayout,
+ xt26g0xa_ecc_get_status)),
+ SPINAND_INFO("XT26G04A",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xE3),
+ NAND_MEMORG(1, 2048, 64, 128, 2048, 40, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&xt26g0xa_ooblayout,
+ xt26g0xa_ecc_get_status)),
+};
+
+static const struct spinand_manufacturer_ops xtx_spinand_manuf_ops = {
+};
+
+const struct spinand_manufacturer xtx_spinand_manufacturer = {
+ .id = SPINAND_MFR_XTX,
+ .name = "XTX",
+ .chips = xtx_spinand_table,
+ .nchips = ARRAY_SIZE(xtx_spinand_table),
+ .ops = &xtx_spinand_manuf_ops,
+};
--- a/include/linux/mtd/spinand.h
+++ b/include/linux/mtd/spinand.h
@@ -266,6 +266,7 @@ extern const struct spinand_manufacturer
extern const struct spinand_manufacturer paragon_spinand_manufacturer;
extern const struct spinand_manufacturer toshiba_spinand_manufacturer;
extern const struct spinand_manufacturer winbond_spinand_manufacturer;
+extern const struct spinand_manufacturer xtx_spinand_manufacturer;

/**
* struct spinand_op_variants - SPI NAND operation variants
@ -0,0 +1,219 @@
From 4bf18d5a2dd02db8c5b16a2cfae513510506df5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Thu, 3 Feb 2022 22:44:40 +0100
Subject: [PATCH 1/2] phy: marvell: phy-mvebu-a3700-comphy: Remove port from
driver configuration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Port number is encoded into argument for SMC call. It is zero for SATA,
PCIe and also both USB 3.0 PHYs. It is non-zero only for Ethernet PHY
(incorrectly called SGMII) on lane 0. Ethernet PHY on lane 1 also uses zero
port number.

So the "port" bits for the SMC call argument can be constructed directly
from PHY type and lane number.

Change driver code to always pass zero port number for non-ethernet PHYs
and for ethernet PHYs determine port number from lane number. This
simplifies the driver.

As port number from DT PHY configuration is not used anymore, remove whole
driver code which parses it. This also simplifies the driver.

Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Reviewed-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/r/20220203214444.1508-2-kabel@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
drivers/phy/marvell/phy-mvebu-a3700-comphy.c | 62 +++++++++-----------
1 file changed, 29 insertions(+), 33 deletions(-)

--- a/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
+++ b/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
@@ -20,7 +20,6 @@
#include <linux/platform_device.h>

#define MVEBU_A3700_COMPHY_LANES 3
-#define MVEBU_A3700_COMPHY_PORTS 2

/* COMPHY Fast SMC function identifiers */
#define COMPHY_SIP_POWER_ON 0x82000001
@@ -45,51 +44,47 @@
#define COMPHY_FW_NET(mode, idx, speed) (COMPHY_FW_MODE(mode) | \
((idx) << 8) | \
((speed) << 2))
-#define COMPHY_FW_PCIE(mode, idx, speed, width) (COMPHY_FW_NET(mode, idx, speed) | \
+#define COMPHY_FW_PCIE(mode, speed, width) (COMPHY_FW_NET(mode, 0, speed) | \
((width) << 18))

struct mvebu_a3700_comphy_conf {
unsigned int lane;
enum phy_mode mode;
int submode;
- unsigned int port;
u32 fw_mode;
};

-#define MVEBU_A3700_COMPHY_CONF(_lane, _mode, _smode, _port, _fw) \
+#define MVEBU_A3700_COMPHY_CONF(_lane, _mode, _smode, _fw) \
{ \
.lane = _lane, \
.mode = _mode, \
.submode = _smode, \
- .port = _port, \
.fw_mode = _fw, \
}

-#define MVEBU_A3700_COMPHY_CONF_GEN(_lane, _mode, _port, _fw) \
- MVEBU_A3700_COMPHY_CONF(_lane, _mode, PHY_INTERFACE_MODE_NA, _port, _fw)
+#define MVEBU_A3700_COMPHY_CONF_GEN(_lane, _mode, _fw) \
+ MVEBU_A3700_COMPHY_CONF(_lane, _mode, PHY_INTERFACE_MODE_NA, _fw)

-#define MVEBU_A3700_COMPHY_CONF_ETH(_lane, _smode, _port, _fw) \
- MVEBU_A3700_COMPHY_CONF(_lane, PHY_MODE_ETHERNET, _smode, _port, _fw)
+#define MVEBU_A3700_COMPHY_CONF_ETH(_lane, _smode, _fw) \
+ MVEBU_A3700_COMPHY_CONF(_lane, PHY_MODE_ETHERNET, _smode, _fw)

static const struct mvebu_a3700_comphy_conf mvebu_a3700_comphy_modes[] = {
/* lane 0 */
- MVEBU_A3700_COMPHY_CONF_GEN(0, PHY_MODE_USB_HOST_SS, 0,
+ MVEBU_A3700_COMPHY_CONF_GEN(0, PHY_MODE_USB_HOST_SS,
COMPHY_FW_MODE_USB3H),
- MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_SGMII, 1,
+ MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_SGMII,
COMPHY_FW_MODE_SGMII),
- MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_2500BASEX, 1,
+ MVEBU_A3700_COMPHY_CONF_ETH(0, PHY_INTERFACE_MODE_2500BASEX,
COMPHY_FW_MODE_2500BASEX),
/* lane 1 */
- MVEBU_A3700_COMPHY_CONF_GEN(1, PHY_MODE_PCIE, 0,
- COMPHY_FW_MODE_PCIE),
- MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_SGMII, 0,
+ MVEBU_A3700_COMPHY_CONF_GEN(1, PHY_MODE_PCIE, COMPHY_FW_MODE_PCIE),
+ MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_SGMII,
COMPHY_FW_MODE_SGMII),
- MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_2500BASEX, 0,
+ MVEBU_A3700_COMPHY_CONF_ETH(1, PHY_INTERFACE_MODE_2500BASEX,
COMPHY_FW_MODE_2500BASEX),
/* lane 2 */
- MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_SATA, 0,
- COMPHY_FW_MODE_SATA),
- MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_USB_HOST_SS, 0,
+ MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_SATA, COMPHY_FW_MODE_SATA),
+ MVEBU_A3700_COMPHY_CONF_GEN(2, PHY_MODE_USB_HOST_SS,
COMPHY_FW_MODE_USB3H),
};

@@ -98,7 +93,6 @@ struct mvebu_a3700_comphy_lane {
unsigned int id;
enum phy_mode mode;
int submode;
- int port;
};

static int mvebu_a3700_comphy_smc(unsigned long function, unsigned long lane,
@@ -120,7 +114,7 @@ static int mvebu_a3700_comphy_smc(unsign
}
}

-static int mvebu_a3700_comphy_get_fw_mode(int lane, int port,
+static int mvebu_a3700_comphy_get_fw_mode(int lane,
enum phy_mode mode,
int submode)
{
@@ -132,7 +126,6 @@ static int mvebu_a3700_comphy_get_fw_mod

for (i = 0; i < n; i++) {
if (mvebu_a3700_comphy_modes[i].lane == lane &&
- mvebu_a3700_comphy_modes[i].port == port &&
mvebu_a3700_comphy_modes[i].mode == mode &&
mvebu_a3700_comphy_modes[i].submode == submode)
break;
@@ -153,7 +146,7 @@ static int mvebu_a3700_comphy_set_mode(s
if (submode == PHY_INTERFACE_MODE_1000BASEX)
submode = PHY_INTERFACE_MODE_SGMII;

- fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id, lane->port, mode,
+ fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id, mode,
submode);
if (fw_mode < 0) {
dev_err(lane->dev, "invalid COMPHY mode\n");
@@ -172,9 +165,10 @@ static int mvebu_a3700_comphy_power_on(s
struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
u32 fw_param;
int fw_mode;
+ int fw_port;
int ret;

- fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id, lane->port,
+ fw_mode = mvebu_a3700_comphy_get_fw_mode(lane->id,
lane->mode, lane->submode);
if (fw_mode < 0) {
dev_err(lane->dev, "invalid COMPHY mode\n");
@@ -191,17 +185,18 @@ static int mvebu_a3700_comphy_power_on(s
fw_param = COMPHY_FW_MODE(fw_mode);
break;
case PHY_MODE_ETHERNET:
+ fw_port = (lane->id == 0) ? 1 : 0;
switch (lane->submode) {
case PHY_INTERFACE_MODE_SGMII:
dev_dbg(lane->dev, "set lane %d to SGMII mode\n",
lane->id);
- fw_param = COMPHY_FW_NET(fw_mode, lane->port,
+ fw_param = COMPHY_FW_NET(fw_mode, fw_port,
COMPHY_FW_SPEED_1_25G);
break;
case PHY_INTERFACE_MODE_2500BASEX:
dev_dbg(lane->dev, "set lane %d to 2500BASEX mode\n",
lane->id);
- fw_param = COMPHY_FW_NET(fw_mode, lane->port,
+ fw_param = COMPHY_FW_NET(fw_mode, fw_port,
COMPHY_FW_SPEED_3_125G);
break;
default:
@@ -212,8 +207,7 @@ static int mvebu_a3700_comphy_power_on(s
break;
case PHY_MODE_PCIE:
dev_dbg(lane->dev, "set lane %d to PCIe mode\n", lane->id);
- fw_param = COMPHY_FW_PCIE(fw_mode, lane->port,
- COMPHY_FW_SPEED_5G,
+ fw_param = COMPHY_FW_PCIE(fw_mode, COMPHY_FW_SPEED_5G,
phy->attrs.bus_width);
break;
default:
@@ -247,17 +241,20 @@ static struct phy *mvebu_a3700_comphy_xl
struct of_phandle_args *args)
{
struct mvebu_a3700_comphy_lane *lane;
+ unsigned int port;
struct phy *phy;

- if (WARN_ON(args->args[0] >= MVEBU_A3700_COMPHY_PORTS))
- return ERR_PTR(-EINVAL);
-
phy = of_phy_simple_xlate(dev, args);
if (IS_ERR(phy))
return phy;

lane = phy_get_drvdata(phy);
- lane->port = args->args[0];
+
+ port = args->args[0];
+ if (port != 0 && (port != 1 || lane->id != 0)) {
+ dev_err(lane->dev, "invalid port number %u\n", port);
+ return ERR_PTR(-EINVAL);
+ }

return phy;
}
@@ -302,7 +299,6 @@ static int mvebu_a3700_comphy_probe(stru
lane->mode = PHY_MODE_INVALID;
lane->submode = PHY_INTERFACE_MODE_NA;
lane->id = lane_id;
- lane->port = -1;
phy_set_drvdata(phy, lane);
}

File diff suppressed because it is too large
@ -0,0 +1,32 @@
From 73a78b6130d9e13daca22b86ad52f063b9403e03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Wed, 8 Dec 2021 03:40:35 +0100
Subject: [PATCH 1/1] arm64: dts: marvell: armada-37xx: Add xtal clock to
comphy node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Kernel driver phy-mvebu-a3700-comphy.c needs to know the rate of the
reference xtal clock. So add missing xtal clock source into comphy device
tree node. If the property is not present, the driver defaults to 25 MHz
xtal rate (which, as far as we know, is used by all the existing boards).

Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Signed-off-by: Gregory CLEMENT <gregory.clement@bootlin.com>
---
arch/arm64/boot/dts/marvell/armada-37xx.dtsi | 2 ++
1 file changed, 2 insertions(+)

--- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
@@ -265,6 +265,8 @@
"lane2_sata_usb3";
#address-cells = <1>;
#size-cells = <0>;
+ clocks = <&xtalclk>;
+ clock-names = "xtal";

comphy0: phy@0 {
reg = <0>;
@ -0,0 +1,64 @@
From ee995101fde67f85a3cd4c74f4f92fc4592e726b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Thu, 3 Feb 2022 22:44:42 +0100
Subject: [PATCH 1/3] Revert "ata: ahci: mvebu: Make SATA PHY optional for
Armada 3720"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 45aefe3d2251e4e229d7662052739f96ad1d08d9.

Armada 3720 PHY driver (phy-mvebu-a3700-comphy.c) does not return
-EOPNOTSUPP from phy_power_on() callback anymore.

So remove AHCI_HFLAG_IGN_NOTSUPP_POWER_ON flag from Armada 3720 plat data.

AHCI_HFLAG_IGN_NOTSUPP_POWER_ON is not used by any other ahci driver, so
remove this flag completely.

Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
Link: https://lore.kernel.org/r/20220203214444.1508-4-kabel@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
drivers/ata/ahci.h | 2 --
drivers/ata/ahci_mvebu.c | 2 +-
drivers/ata/libahci_platform.c | 2 +-
3 files changed, 2 insertions(+), 4 deletions(-)

--- a/drivers/ata/ahci.h
+++ b/drivers/ata/ahci.h
@@ -240,8 +240,6 @@ enum {
as default lpm_policy */
AHCI_HFLAG_SUSPEND_PHYS = (1 << 26), /* handle PHYs during
suspend/resume */
- AHCI_HFLAG_IGN_NOTSUPP_POWER_ON = (1 << 27), /* ignore -EOPNOTSUPP
- from phy_power_on() */
AHCI_HFLAG_NO_SXS = (1 << 28), /* SXS not supported */

/* ap->flags bits */
--- a/drivers/ata/ahci_mvebu.c
+++ b/drivers/ata/ahci_mvebu.c
@@ -227,7 +227,7 @@ static const struct ahci_mvebu_plat_data

static const struct ahci_mvebu_plat_data ahci_mvebu_armada_3700_plat_data = {
.plat_config = ahci_mvebu_armada_3700_config,
- .flags = AHCI_HFLAG_SUSPEND_PHYS | AHCI_HFLAG_IGN_NOTSUPP_POWER_ON,
+ .flags = AHCI_HFLAG_SUSPEND_PHYS,
};

static const struct of_device_id ahci_mvebu_of_match[] = {
--- a/drivers/ata/libahci_platform.c
+++ b/drivers/ata/libahci_platform.c
@@ -59,7 +59,7 @@ int ahci_platform_enable_phys(struct ahc
}

rc = phy_power_on(hpriv->phys[i]);
- if (rc && !(rc == -EOPNOTSUPP && (hpriv->flags & AHCI_HFLAG_IGN_NOTSUPP_POWER_ON))) {
+ if (rc) {
phy_exit(hpriv->phys[i]);
goto disable_phys;
}
@ -0,0 +1,166 @@
From 8e10548f7f4814e530857d2049d6af6bc78add53 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Thu, 3 Feb 2022 22:44:43 +0100
Subject: [PATCH 2/3] Revert "usb: host: xhci: mvebu: make USB 3.0 PHY optional
for Armada 3720"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit 3241929b67d28c83945d3191c6816a3271fd6b85.

Armada 3720 phy driver (phy-mvebu-a3700-comphy.c) does not return
-EOPNOTSUPP from phy_power_on() callback anymore.

So remove XHCI_SKIP_PHY_INIT flag from xhci_mvebu_a3700_plat_setup() and
then also whole xhci_mvebu_a3700_plat_setup() function which is there just
to handle -EOPNOTSUPP for XHCI_SKIP_PHY_INIT.

xhci plat_setup callback is not used by any other xhci plat driver, so
remove this callback completely.

Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Link: https://lore.kernel.org/r/20220203214444.1508-5-kabel@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
drivers/usb/host/xhci-mvebu.c | 42 -----------------------------------
drivers/usb/host/xhci-mvebu.h | 6 -----
drivers/usb/host/xhci-plat.c | 20 +----------------
drivers/usb/host/xhci-plat.h | 1 -
4 files changed, 1 insertion(+), 68 deletions(-)

--- a/drivers/usb/host/xhci-mvebu.c
+++ b/drivers/usb/host/xhci-mvebu.c
@@ -8,7 +8,6 @@
#include <linux/mbus.h>
#include <linux/of.h>
#include <linux/platform_device.h>
-#include <linux/phy/phy.h>

#include <linux/usb.h>
#include <linux/usb/hcd.h>
@@ -74,47 +73,6 @@ int xhci_mvebu_mbus_init_quirk(struct us

return 0;
}
-
-int xhci_mvebu_a3700_plat_setup(struct usb_hcd *hcd)
-{
- struct xhci_hcd *xhci = hcd_to_xhci(hcd);
- struct device *dev = hcd->self.controller;
- struct phy *phy;
- int ret;
-
- /* Old bindings miss the PHY handle */
- phy = of_phy_get(dev->of_node, "usb3-phy");
- if (IS_ERR(phy) && PTR_ERR(phy) == -EPROBE_DEFER)
- return -EPROBE_DEFER;
- else if (IS_ERR(phy))
- goto phy_out;
-
- ret = phy_init(phy);
- if (ret)
- goto phy_put;
-
- ret = phy_set_mode(phy, PHY_MODE_USB_HOST_SS);
- if (ret)
- goto phy_exit;
-
- ret = phy_power_on(phy);
- if (ret == -EOPNOTSUPP) {
- /* Skip initializatin of XHCI PHY when it is unsupported by firmware */
- dev_warn(dev, "PHY unsupported by firmware\n");
- xhci->quirks |= XHCI_SKIP_PHY_INIT;
- }
- if (ret)
- goto phy_exit;
-
- phy_power_off(phy);
-phy_exit:
- phy_exit(phy);
-phy_put:
- of_phy_put(phy);
-phy_out:
-
- return 0;
-}

int xhci_mvebu_a3700_init_quirk(struct usb_hcd *hcd)
{
--- a/drivers/usb/host/xhci-mvebu.h
+++ b/drivers/usb/host/xhci-mvebu.h
@@ -12,18 +12,12 @@ struct usb_hcd;

#if IS_ENABLED(CONFIG_USB_XHCI_MVEBU)
int xhci_mvebu_mbus_init_quirk(struct usb_hcd *hcd);
-int xhci_mvebu_a3700_plat_setup(struct usb_hcd *hcd);
int xhci_mvebu_a3700_init_quirk(struct usb_hcd *hcd);
#else
static inline int xhci_mvebu_mbus_init_quirk(struct usb_hcd *hcd)
{
return 0;
}
-
-static inline int xhci_mvebu_a3700_plat_setup(struct usb_hcd *hcd)
-{
- return 0;
-}

static inline int xhci_mvebu_a3700_init_quirk(struct usb_hcd *hcd)
{
--- a/drivers/usb/host/xhci-plat.c
+++ b/drivers/usb/host/xhci-plat.c
@@ -44,16 +44,6 @@ static void xhci_priv_plat_start(struct
priv->plat_start(hcd);
}

-static int xhci_priv_plat_setup(struct usb_hcd *hcd)
-{
- struct xhci_plat_priv *priv = hcd_to_xhci_priv(hcd);
-
- if (!priv->plat_setup)
- return 0;
-
- return priv->plat_setup(hcd);
-}
-
static int xhci_priv_init_quirk(struct usb_hcd *hcd)
{
struct xhci_plat_priv *priv = hcd_to_xhci_priv(hcd);
@@ -121,7 +111,6 @@ static const struct xhci_plat_priv xhci_
};

static const struct xhci_plat_priv xhci_plat_marvell_armada3700 = {
- .plat_setup = xhci_mvebu_a3700_plat_setup,
.init_quirk = xhci_mvebu_a3700_init_quirk,
};

@@ -341,14 +330,7 @@ static int xhci_plat_probe(struct platfo

hcd->tpl_support = of_usb_host_tpl_support(sysdev->of_node);
xhci->shared_hcd->tpl_support = hcd->tpl_support;
-
- if (priv) {
- ret = xhci_priv_plat_setup(hcd);
- if (ret)
- goto disable_usb_phy;
- }
-
- if ((xhci->quirks & XHCI_SKIP_PHY_INIT) || (priv && (priv->quirks & XHCI_SKIP_PHY_INIT)))
+ if (priv && (priv->quirks & XHCI_SKIP_PHY_INIT))
hcd->skip_phy_initialization = 1;

if (priv && (priv->quirks & XHCI_SG_TRB_CACHE_SIZE_QUIRK))
--- a/drivers/usb/host/xhci-plat.h
+++ b/drivers/usb/host/xhci-plat.h
@@ -13,7 +13,6 @@
struct xhci_plat_priv {
const char *firmware_name;
unsigned long long quirks;
- int (*plat_setup)(struct usb_hcd *);
void (*plat_start)(struct usb_hcd *);
int (*init_quirk)(struct usb_hcd *);
int (*suspend_quirk)(struct usb_hcd *);
@ -0,0 +1,39 @@
From 9a4556dad7bd0a6b8339cb72e169f5c76f2af6f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Thu, 3 Feb 2022 22:44:44 +0100
Subject: [PATCH 3/3] Revert "PCI: aardvark: Fix initialization with old
Marvell's Arm Trusted Firmware"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit b0c6ae0f8948a2be6bf4e8b4bbab9ca1343289b6.

Armada 3720 phy driver (phy-mvebu-a3700-comphy.c) does not return
-EOPNOTSUPP from phy_power_on() callback anymore.

So remove dead code which handles -EOPNOTSUPP return value.

Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <kabel@kernel.org>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Link: https://lore.kernel.org/r/20220203214444.1508-6-kabel@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
drivers/pci/controller/pci-aardvark.c | 4 +---
1 file changed, 1 insertion(+), 3 deletions(-)

--- a/drivers/pci/controller/pci-aardvark.c
+++ b/drivers/pci/controller/pci-aardvark.c
@@ -1642,9 +1642,7 @@ static int advk_pcie_enable_phy(struct a
}

ret = phy_power_on(pcie->phy);
- if (ret == -EOPNOTSUPP) {
- dev_warn(&pcie->pdev->dev, "PHY unsupported by firmware\n");
- } else if (ret) {
+ if (ret) {
phy_exit(pcie->phy);
return ret;
}
@ -0,0 +1,194 @@
From 0a6fc70d76bddf98278af2ac000379c82aec8f11 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org>
Date: Mon, 29 Aug 2022 10:30:46 +0200
Subject: [PATCH] phy: marvell: phy-mvebu-a3700-comphy: Remove broken reset
support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reset support for the SATA PHY is somehow broken; after calling it, the
kernel is not able to detect and initialize the SATA disk Samsung SSD
850 EMT0 [1].

Reset support was introduced in commit 934337080c6c ("phy: marvell:
phy-mvebu-a3700-comphy: Add native kernel implementation") as part of a
complete rewrite of this driver. The v1 patch series of that commit [2]
did not contain reset support and was tested to work fine with
Ethernet, SATA and USB PHYs.

So for now, remove the broken reset support and change the
implementation of the power_off callback to power off all functions on
the specified lane (not only the selected function), because during
startup the kernel does not know which function was selected and
configured by the bootloader. The same logic was used in the v1 patch
series of that commit.

This change fixes the initialization issues with the Samsung SSD 850
SATA disk, and the disk works again, as it did before the mentioned
commit.

Once the problem with the PHY reset callback is solved, its
functionality can be re-introduced. For now it is unknown why it does
not work.

[1] - https://lore.kernel.org/r/20220531124159.3e4lgn2v462irbtz@shindev/
[2] - https://lore.kernel.org/r/20211028184242.22105-1-kabel@kernel.org/

Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Fixes: 934337080c6c ("phy: marvell: phy-mvebu-a3700-comphy: Add native kernel implementation")
Cc: stable@vger.kernel.org # v5.18+
Signed-off-by: Pali Rohár <pali@kernel.org>
Tested-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Link: https://lore.kernel.org/r/20220829083046.15082-1-pali@kernel.org
Signed-off-by: Vinod Koul <vkoul@kernel.org>
---
drivers/phy/marvell/phy-mvebu-a3700-comphy.c | 87 ++++----------------
1 file changed, 17 insertions(+), 70 deletions(-)

--- a/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
+++ b/drivers/phy/marvell/phy-mvebu-a3700-comphy.c
@@ -274,7 +274,6 @@ struct mvebu_a3700_comphy_lane {
int submode;
bool invert_tx;
bool invert_rx;
- bool needs_reset;
};

struct gbe_phy_init_data_fix {
@@ -1097,40 +1096,12 @@ mvebu_a3700_comphy_pcie_power_off(struct
0x0, PU_PLL_BIT | PU_RX_BIT | PU_TX_BIT);
}

-static int mvebu_a3700_comphy_reset(struct phy *phy)
+static void mvebu_a3700_comphy_usb3_power_off(struct mvebu_a3700_comphy_lane *lane)
{
- struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
- u16 mask, data;
-
- dev_dbg(lane->dev, "resetting lane %d\n", lane->id);
-
- /* COMPHY reset for internal logic */
- comphy_lane_reg_set(lane, COMPHY_SFT_RESET,
- SFT_RST_NO_REG, SFT_RST_NO_REG);
-
- /* COMPHY register reset (cleared automatically) */
- comphy_lane_reg_set(lane, COMPHY_SFT_RESET, SFT_RST, SFT_RST);
-
- /* PIPE soft and register reset */
- data = PIPE_SOFT_RESET | PIPE_REG_RESET;
- mask = data;
- comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL, data, mask);
-
- /* Release PIPE register reset */
- comphy_lane_reg_set(lane, COMPHY_PIPE_RST_CLK_CTRL,
- 0x0, PIPE_REG_RESET);
-
- /* Reset SB configuration register (only for lanes 0 and 1) */
- if (lane->id == 0 || lane->id == 1) {
- u32 mask, data;
-
- data = PIN_RESET_CORE_BIT | PIN_RESET_COMPHY_BIT |
- PIN_PU_PLL_BIT | PIN_PU_RX_BIT | PIN_PU_TX_BIT;
- mask = data | PIN_PU_IVREF_BIT | PIN_TX_IDLE_BIT;
- comphy_periph_reg_set(lane, COMPHY_PHY_CFG1, data, mask);
- }
-
- return 0;
+ /*
+ * The USB3 MAC sets the USB3 PHY to low state, so we do not
+ * need to power off USB3 PHY again.
+ */
}

static bool mvebu_a3700_comphy_check_mode(int lane,
@@ -1171,10 +1142,6 @@ static int mvebu_a3700_comphy_set_mode(s
(lane->mode != mode || lane->submode != submode))
return -EBUSY;

- /* If changing mode, ensure reset is called */
- if (lane->mode != PHY_MODE_INVALID && lane->mode != mode)
- lane->needs_reset = true;
-
/* Just remember the mode, ->power_on() will do the real setup */
lane->mode = mode;
lane->submode = submode;
@@ -1185,7 +1152,6 @@ static int mvebu_a3700_comphy_set_mode(s
static int mvebu_a3700_comphy_power_on(struct phy *phy)
{
struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);
- int ret;

if (!mvebu_a3700_comphy_check_mode(lane->id, lane->mode,
lane->submode)) {
@@ -1193,14 +1159,6 @@ static int mvebu_a3700_comphy_power_on(s
return -EINVAL;
}

- if (lane->needs_reset) {
- ret = mvebu_a3700_comphy_reset(phy);
- if (ret)
- return ret;
-
- lane->needs_reset = false;
- }
-
switch (lane->mode) {
case PHY_MODE_USB_HOST_SS:
dev_dbg(lane->dev, "set lane %d to USB3 host mode\n", lane->id);
@@ -1224,38 +1182,28 @@ static int mvebu_a3700_comphy_power_off(
{
struct mvebu_a3700_comphy_lane *lane = phy_get_drvdata(phy);

- switch (lane->mode) {
- case PHY_MODE_USB_HOST_SS:
- /*
- * The USB3 MAC sets the USB3 PHY to low state, so we do not
- * need to power off USB3 PHY again.
- */
- break;
-
- case PHY_MODE_SATA:
- mvebu_a3700_comphy_sata_power_off(lane);
- break;
-
- case PHY_MODE_ETHERNET:
+ switch (lane->id) {
+ case 0:
+ mvebu_a3700_comphy_usb3_power_off(lane);
mvebu_a3700_comphy_ethernet_power_off(lane);
- break;
-
- case PHY_MODE_PCIE:
+ return 0;
+ case 1:
mvebu_a3700_comphy_pcie_power_off(lane);
- break;
-
+ mvebu_a3700_comphy_ethernet_power_off(lane);
+ return 0;
+ case 2:
+ mvebu_a3700_comphy_usb3_power_off(lane);
+ mvebu_a3700_comphy_sata_power_off(lane);
+ return 0;
default:
dev_err(lane->dev, "invalid COMPHY mode\n");
return -EINVAL;
}
-
- return 0;
}

static const struct phy_ops mvebu_a3700_comphy_ops = {
.power_on = mvebu_a3700_comphy_power_on,
.power_off = mvebu_a3700_comphy_power_off,
- .reset = mvebu_a3700_comphy_reset,
.set_mode = mvebu_a3700_comphy_set_mode,
.owner = THIS_MODULE,
};
@@ -1393,8 +1341,7 @@ static int mvebu_a3700_comphy_probe(stru
* To avoid relying on the bootloader/firmware configuration,
* power off all comphys.
*/
- mvebu_a3700_comphy_reset(phy);
- lane->needs_reset = false;
+ mvebu_a3700_comphy_power_off(phy);
}

provider = devm_of_phy_provider_register(&pdev->dev,
@ -0,0 +1,90 @@
From 86fc59ef818beb0e1945d17f8e734898baba7e4e Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Sun, 13 Mar 2022 15:45:23 -0700
Subject: [PATCH 1/2] regmap: add configurable downshift for addresses

Add an additional reg_downshift to be applied to register addresses before
any register accesses. An example of a device that uses this is a VSC7514
chip, which requires each register address to be downshifted by two when
the access is performed over a SPI bus.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Link: https://lore.kernel.org/r/20220313224524.399947-2-colin.foster@in-advantage.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
drivers/base/regmap/internal.h | 1 +
drivers/base/regmap/regmap.c | 5 +++++
include/linux/regmap.h | 3 +++
3 files changed, 9 insertions(+)

--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -31,6 +31,7 @@ struct regmap_format {
size_t buf_size;
size_t reg_bytes;
size_t pad_bytes;
+ size_t reg_downshift;
size_t val_bytes;
void (*format_write)(struct regmap *map,
unsigned int reg, unsigned int val);
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -823,6 +823,7 @@ struct regmap *__regmap_init(struct devi

map->format.reg_bytes = DIV_ROUND_UP(config->reg_bits, 8);
map->format.pad_bytes = config->pad_bits / 8;
+ map->format.reg_downshift = config->reg_downshift;
map->format.val_bytes = DIV_ROUND_UP(config->val_bits, 8);
map->format.buf_size = DIV_ROUND_UP(config->reg_bits +
config->val_bits + config->pad_bits, 8);
@@ -1735,6 +1736,7 @@ static int _regmap_raw_write_impl(struct
return ret;
}

+ reg >>= map->format.reg_downshift;
map->format.format_reg(map->work_buf, reg, map->reg_shift);
regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
map->write_flag_mask);
@@ -1905,6 +1907,7 @@ static int _regmap_bus_formatted_write(v
return ret;
}

+ reg >>= map->format.reg_downshift;
map->format.format_write(map, reg, val);

trace_regmap_hw_write_start(map, reg, 1);
@@ -2346,6 +2349,7 @@ static int _regmap_raw_multi_reg_write(s
unsigned int reg = regs[i].reg;
unsigned int val = regs[i].def;
trace_regmap_hw_write_start(map, reg, 1);
+ reg >>= map->format.reg_downshift;
map->format.format_reg(u8, reg, map->reg_shift);
u8 += reg_bytes + pad_bytes;
map->format.format_val(u8, val, 0);
@@ -2673,6 +2677,7 @@ static int _regmap_raw_read(struct regma
return ret;
}

+ reg >>= map->format.reg_downshift;
map->format.format_reg(map->work_buf, reg, map->reg_shift);
regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
map->read_flag_mask);
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -237,6 +237,8 @@ typedef void (*regmap_unlock)(void *);
* @reg_stride: The register address stride. Valid register addresses are a
* multiple of this value. If set to 0, a value of 1 will be
* used.
+ * @reg_downshift: The number of bits to downshift the register before
+ * performing any operations.
* @pad_bits: Number of bits of padding between register and value.
* @val_bits: Number of bits in a register value, mandatory.
*
@@ -360,6 +362,7 @@ struct regmap_config {

int reg_bits;
int reg_stride;
+ int reg_downshift;
int pad_bits;
int val_bits;
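
Note: a minimal usage sketch, not part of the patch above; the device and
the field values are illustrative assumptions. With reg_downshift set,
callers keep using datasheet register addresses and the regmap core
applies the shift before the address is formatted for the bus:

	#include <linux/regmap.h>

	/* Hypothetical SPI-attached device whose wire protocol wants
	 * datasheet (byte) addresses divided by four: regmap performs
	 * the >> 2 internally on every read and write. */
	static const struct regmap_config example_spi_regmap_config = {
		.reg_bits = 24,
		.val_bits = 32,
		.reg_downshift = 2,
	};
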
@ -0,0 +1,95 @@
From 0074f3f2b1e43d3cedd97e47fb6980db6d2ba79e Mon Sep 17 00:00:00 2001
From: Colin Foster <colin.foster@in-advantage.com>
Date: Sun, 13 Mar 2022 15:45:24 -0700
Subject: [PATCH 2/2] regmap: allow a defined reg_base to be added to every
address

There's an inconsistency that arises when a register set can be accessed
internally via MMIO, or externally via SPI. The VSC7514 chip allows both
modes of operation. When internally accessed, the system utilizes __iomem,
devm_ioremap_resource, and devm_regmap_init_mmio.

For SPI it isn't possible to utilize memory-mapped IO. To properly operate,
the resource base must be added to the register before every operation.

Signed-off-by: Colin Foster <colin.foster@in-advantage.com>
Link: https://lore.kernel.org/r/20220313224524.399947-3-colin.foster@in-advantage.com
Signed-off-by: Mark Brown <broonie@kernel.org>
---
drivers/base/regmap/internal.h | 1 +
drivers/base/regmap/regmap.c | 6 ++++++
include/linux/regmap.h | 3 +++
3 files changed, 10 insertions(+)

--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -63,6 +63,7 @@ struct regmap {
regmap_unlock unlock;
void *lock_arg; /* This is passed to lock/unlock functions */
gfp_t alloc_flags;
+ unsigned int reg_base;

struct device *dev; /* Device we do I/O on */
void *work_buf; /* Scratch buffer used to format I/O */
--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -821,6 +821,8 @@ struct regmap *__regmap_init(struct devi
else
map->alloc_flags = GFP_KERNEL;

+ map->reg_base = config->reg_base;
+
map->format.reg_bytes = DIV_ROUND_UP(config->reg_bits, 8);
map->format.pad_bytes = config->pad_bits / 8;
map->format.reg_downshift = config->reg_downshift;
@@ -1736,6 +1738,7 @@ static int _regmap_raw_write_impl(struct
return ret;
}

+ reg += map->reg_base;
reg >>= map->format.reg_downshift;
map->format.format_reg(map->work_buf, reg, map->reg_shift);
regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
@@ -1907,6 +1910,7 @@ static int _regmap_bus_formatted_write(v
return ret;
}

+ reg += map->reg_base;
reg >>= map->format.reg_downshift;
map->format.format_write(map, reg, val);

@@ -2349,6 +2353,7 @@ static int _regmap_raw_multi_reg_write(s
unsigned int reg = regs[i].reg;
unsigned int val = regs[i].def;
trace_regmap_hw_write_start(map, reg, 1);
+ reg += map->reg_base;
reg >>= map->format.reg_downshift;
map->format.format_reg(u8, reg, map->reg_shift);
u8 += reg_bytes + pad_bytes;
@@ -2677,6 +2682,7 @@ static int _regmap_raw_read(struct regma
return ret;
}

+ reg += map->reg_base;
reg >>= map->format.reg_downshift;
map->format.format_reg(map->work_buf, reg, map->reg_shift);
regmap_set_work_buf_flag_mask(map, map->format.reg_bytes,
--- a/include/linux/regmap.h
+++ b/include/linux/regmap.h
@@ -239,6 +239,8 @@ typedef void (*regmap_unlock)(void *);
* used.
* @reg_downshift: The number of bits to downshift the register before
* performing any operations.
+ * @reg_base: Value to be added to every register address before performing any
+ * operation.
* @pad_bits: Number of bits of padding between register and value.
* @val_bits: Number of bits in a register value, mandatory.
*
@@ -363,6 +365,7 @@ struct regmap_config {

int reg_bits;
int reg_stride;
int reg_downshift;
+ unsigned int reg_base;
int pad_bits;
int val_bits;
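
Note: combined with reg_downshift from the previous patch, the effective
bus address is (reg + reg_base) >> reg_downshift — the base is added
first, then the shift is applied. A sketch under that assumption (device
and numbers are made up):

	#include <linux/regmap.h>

	/* Hypothetical SPI view of a register block that sits at
	 * 0x71000000 when mapped internally via MMIO: a read of
	 * register 0x10 goes on the wire as (0x71000000 + 0x10) >> 2. */
	static const struct regmap_config example_ext_regmap_config = {
		.reg_bits = 32,
		.val_bits = 32,
		.reg_base = 0x71000000,
		.reg_downshift = 2,
	};
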
@ -0,0 +1,57 @@
From 697c3892d825fb78f42ec8e53bed065dd728db3e Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Mon, 30 Jan 2023 02:04:57 +0000
Subject: [PATCH] regmap: apply reg_base and reg_downshift for single register
ops

reg_base and reg_downshift currently don't have any effect if used with
a regmap_bus or regmap_config which only offers single register
operations (i.e. reg_read, reg_write and optionally reg_update_bits).

Fix that and take them into account also for a regmap_bus with only
reg_read and reg_write operations by applying reg_base and
reg_downshift in _regmap_bus_reg_write and _regmap_bus_reg_read.

Also apply reg_base and reg_downshift in _regmap_update_bits, but only
in case the operation is carried out with a reg_update_bits call
defined in either regmap_bus or regmap_config.

Fixes: 0074f3f2b1e43d ("regmap: allow a defined reg_base to be added to every address")
Fixes: 86fc59ef818beb ("regmap: add configurable downshift for addresses")
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Tested-by: Colin Foster <colin.foster@in-advantage.com>
Link: https://lore.kernel.org/r/Y9clyVS3tQEHlUhA@makrotopia.org
Signed-off-by: Mark Brown <broonie@kernel.org>
---
drivers/base/regmap/regmap.c | 6 ++++++
1 file changed, 6 insertions(+)

--- a/drivers/base/regmap/regmap.c
+++ b/drivers/base/regmap/regmap.c
@@ -1929,6 +1929,8 @@ static int _regmap_bus_reg_write(void *c
{
struct regmap *map = context;

+ reg += map->reg_base;
+ reg >>= map->format.reg_downshift;
return map->bus->reg_write(map->bus_context, reg, val);
}

@@ -2703,6 +2705,8 @@ static int _regmap_bus_reg_read(void *co
{
struct regmap *map = context;

+ reg += map->reg_base;
+ reg >>= map->format.reg_downshift;
return map->bus->reg_read(map->bus_context, reg, val);
}

@@ -3078,6 +3082,8 @@ static int _regmap_update_bits(struct re
*change = false;

if (regmap_volatile(map, reg) && map->reg_update_bits) {
+ reg += map->reg_base;
+ reg >>= map->format.reg_downshift;
ret = map->reg_update_bits(map->bus_context, reg, mask, val);
if (ret == 0 && change)
*change = true;
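
Note: after this fix the same translation also reaches buses that only
implement per-register callbacks. A sketch of such a regmap_bus — the
names and bodies are illustrative, not from the patch:

	#include <linux/regmap.h>

	/* The reg seen in these callbacks already has reg_base added
	 * and reg_downshift applied by the regmap core. */
	static int example_bus_reg_read(void *context, unsigned int reg,
					unsigned int *val)
	{
		*val = 0;	/* talk to the hardware here */
		return 0;
	}

	static int example_bus_reg_write(void *context, unsigned int reg,
					 unsigned int val)
	{
		return 0;	/* talk to the hardware here */
	}

	static const struct regmap_bus example_single_reg_bus = {
		.reg_read = example_bus_reg_read,
		.reg_write = example_bus_reg_write,
	};
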
@ -0,0 +1,72 @@
From bcdf0315a61a29eb753a607d3a85a4032de72d94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 10 May 2022 15:12:59 +0200
Subject: [PATCH] mtd: call of_platform_populate() for MTD partitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Until this change, the MTD subsystem supported handling partitions only
with MTD partition parsers. That's a specific / limited API designed
around partitions.

Some MTD partitions may however require different handling. They may
contain specific data that needs to be parsed and somehow extracted. For
that purpose the MTD subsystem should allow binding of standard platform
drivers.

An example is a U-Boot (sub)partition with environment variables. There
exists a "u-boot,env" DT binding for an MTD (sub)partition that requires
an NVMEM driver.

Ref: 5db1c2dbc04c ("dt-bindings: nvmem: add U-Boot environment variables binding")
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220510131259.555-1-zajec5@gmail.com
---
drivers/mtd/mtdpart.c | 9 +++++++++
1 file changed, 9 insertions(+)

--- a/drivers/mtd/mtdpart.c
+++ b/drivers/mtd/mtdpart.c
@@ -17,6 +17,7 @@
#include <linux/mtd/partitions.h>
#include <linux/err.h>
#include <linux/of.h>
+#include <linux/of_platform.h>

#include "mtdcore.h"

@@ -577,10 +578,16 @@ static int mtd_part_of_parse(struct mtd_
struct mtd_part_parser *parser;
struct device_node *np;
struct property *prop;
+ struct device *dev;
const char *compat;
const char *fixed = "fixed-partitions";
int ret, err = 0;

+ dev = &master->dev;
+ /* Use parent device (controller) if the top level MTD is not registered */
+ if (!IS_ENABLED(CONFIG_MTD_PARTITIONED_MASTER) && !mtd_is_partition(master))
+ dev = master->dev.parent;
+
np = mtd_get_of_node(master);
if (mtd_is_partition(master))
of_node_get(np);
@@ -593,6 +600,7 @@ static int mtd_part_of_parse(struct mtd_
continue;
ret = mtd_part_do_parse(parser, master, pparts, NULL);
if (ret > 0) {
+ of_platform_populate(np, NULL, NULL, dev);
of_node_put(np);
return ret;
}
@@ -600,6 +608,7 @@ static int mtd_part_of_parse(struct mtd_
if (ret < 0 && !err)
err = ret;
}
+ of_platform_populate(np, NULL, NULL, dev);
of_node_put(np);

/*
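
Note: with of_platform_populate() run on the partitions node, a plain
platform driver can bind to a (sub)partition node. A hedged sketch of
the shape such a driver takes — apart from the "u-boot,env" compatible
named in the commit message, everything here is illustrative:

	#include <linux/module.h>
	#include <linux/of.h>
	#include <linux/platform_device.h>

	static int example_part_probe(struct platform_device *pdev)
	{
		/* pdev->dev.of_node is the partition's DT node; a real
		 * driver (e.g. an NVMEM provider) would look up the MTD
		 * behind it and parse the contents. */
		return 0;
	}

	static const struct of_device_id example_part_of_match[] = {
		{ .compatible = "u-boot,env" },
		{ /* sentinel */ },
	};
	MODULE_DEVICE_TABLE(of, example_part_of_match);

	static struct platform_driver example_part_driver = {
		.probe = example_part_probe,
		.driver = {
			.name = "example-part",
			.of_match_table = example_part_of_match,
		},
	};
	module_platform_driver(example_part_driver);
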
@ -0,0 +1,302 @@
From 9b78ef0c7997052e9eaa0f7a4513d546fa17358c Mon Sep 17 00:00:00 2001
From: Mikhail Zhilkin <csharper2005@gmail.com>
Date: Sun, 29 May 2022 11:07:14 +0000
Subject: [PATCH] mtd: parsers: add support for Sercomm partitions

This adds an MTD partition parser for the Sercomm partition table that
is used in some Beeline, Netgear and Sercomm routers.

The Sercomm partition map table contains real partition offsets, which
may differ from device to device depending on the number and location of
bad blocks on NAND.

Original patch (proposed by NOGUCHI Hiroshi):
Link: https://github.com/openwrt/openwrt/pull/1318#issuecomment-420607394

Signed-off-by: NOGUCHI Hiroshi <drvlabo@gmail.com>
Signed-off-by: Mikhail Zhilkin <csharper2005@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220529110714.189732-1-csharper2005@gmail.com
---
drivers/mtd/parsers/Kconfig | 9 ++
drivers/mtd/parsers/Makefile | 1 +
drivers/mtd/parsers/scpart.c | 248 +++++++++++++++++++++++++++++++++++
3 files changed, 258 insertions(+)
create mode 100644 drivers/mtd/parsers/scpart.c

--- a/drivers/mtd/parsers/Kconfig
+++ b/drivers/mtd/parsers/Kconfig
@@ -186,3 +186,12 @@ config MTD_QCOMSMEM_PARTS
help
This provides support for parsing partitions from Shared Memory (SMEM)
for NAND and SPI flash on Qualcomm platforms.
+
+config MTD_SERCOMM_PARTS
+ tristate "Sercomm partition table parser"
+ depends on MTD && RALINK
+ help
+ This provides partitions table parser for devices with Sercomm
+ partition map. This partition table contains real partition
+ offsets, which may differ from device to device depending on the
+ number and location of bad blocks on NAND.
--- a/drivers/mtd/parsers/Makefile
+++ b/drivers/mtd/parsers/Makefile
@@ -10,6 +10,7 @@ ofpart-$(CONFIG_MTD_OF_PARTS_LINKSYS_NS)
obj-$(CONFIG_MTD_PARSER_IMAGETAG) += parser_imagetag.o
obj-$(CONFIG_MTD_AFS_PARTS) += afs.o
obj-$(CONFIG_MTD_PARSER_TRX) += parser_trx.o
+obj-$(CONFIG_MTD_SERCOMM_PARTS) += scpart.o
obj-$(CONFIG_MTD_SHARPSL_PARTS) += sharpslpart.o
obj-$(CONFIG_MTD_REDBOOT_PARTS) += redboot.o
obj-$(CONFIG_MTD_QCOMSMEM_PARTS) += qcomsmempart.o
--- /dev/null
+++ b/drivers/mtd/parsers/scpart.c
@@ -0,0 +1,248 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * drivers/mtd/scpart.c: Sercomm Partition Parser
+ *
+ * Copyright (C) 2018 NOGUCHI Hiroshi
+ * Copyright (C) 2022 Mikhail Zhilkin
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/module.h>
+
+#define MOD_NAME "scpart"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) MOD_NAME ": " fmt
+
+#define ID_ALREADY_FOUND 0xffffffffUL
+
+#define MAP_OFFS_IN_BLK 0x800
+#define MAP_MIRROR_NUM 2
+
+static const char sc_part_magic[] = {
+ 'S', 'C', 'F', 'L', 'M', 'A', 'P', 'O', 'K', '\0',
+};
+#define PART_MAGIC_LEN sizeof(sc_part_magic)
+
+/* assumes that all fields are set by CPU native endian */
+struct sc_part_desc {
+ uint32_t part_id;
+ uint32_t part_offs;
+ uint32_t part_bytes;
+};
+
+static uint32_t scpart_desc_is_valid(struct sc_part_desc *pdesc)
+{
+ return ((pdesc->part_id != 0xffffffffUL) &&
+ (pdesc->part_offs != 0xffffffffUL) &&
+ (pdesc->part_bytes != 0xffffffffUL));
+}
+
+static int scpart_scan_partmap(struct mtd_info *master, loff_t partmap_offs,
+ struct sc_part_desc **ppdesc)
+{
+ int cnt = 0;
+ int res = 0;
+ int res2;
+ loff_t offs;
+ size_t retlen;
+ struct sc_part_desc *pdesc = NULL;
+ struct sc_part_desc *tmpdesc;
+ uint8_t *buf;
+
+ buf = kzalloc(master->erasesize, GFP_KERNEL);
+ if (!buf) {
+ res = -ENOMEM;
+ goto out;
+ }
+
+ res2 = mtd_read(master, partmap_offs, master->erasesize, &retlen, buf);
+ if (res2 || retlen != master->erasesize) {
+ res = -EIO;
+ goto free;
+ }
+
+ for (offs = MAP_OFFS_IN_BLK;
+ offs < master->erasesize - sizeof(*tmpdesc);
+ offs += sizeof(*tmpdesc)) {
+ tmpdesc = (struct sc_part_desc *)&buf[offs];
+ if (!scpart_desc_is_valid(tmpdesc))
+ break;
+ cnt++;
+ }
+
+ if (cnt > 0) {
+ int bytes = cnt * sizeof(*pdesc);
+
+ pdesc = kcalloc(cnt, sizeof(*pdesc), GFP_KERNEL);
+ if (!pdesc) {
+ res = -ENOMEM;
+ goto free;
+ }
+ memcpy(pdesc, &(buf[MAP_OFFS_IN_BLK]), bytes);
+
+ *ppdesc = pdesc;
+ res = cnt;
+ }
+
+free:
+ kfree(buf);
+
+out:
+ return res;
+}
+
+static int scpart_find_partmap(struct mtd_info *master,
+ struct sc_part_desc **ppdesc)
+{
+ int magic_found = 0;
+ int res = 0;
+ int res2;
+ loff_t offs = 0;
+ size_t retlen;
+ uint8_t rdbuf[PART_MAGIC_LEN];
+
+ while ((magic_found < MAP_MIRROR_NUM) &&
+ (offs < master->size) &&
+ !mtd_block_isbad(master, offs)) {
+ res2 = mtd_read(master, offs, PART_MAGIC_LEN, &retlen, rdbuf);
+ if (res2 || retlen != PART_MAGIC_LEN) {
+ res = -EIO;
+ goto out;
+ }
+ if (!memcmp(rdbuf, sc_part_magic, PART_MAGIC_LEN)) {
+ pr_debug("Signature found at 0x%llx\n", offs);
+ magic_found++;
+ res = scpart_scan_partmap(master, offs, ppdesc);
+ if (res > 0)
+ goto out;
+ }
+ offs += master->erasesize;
+ }
+
+out:
+ if (res > 0)
+ pr_info("Valid 'SC PART MAP' (%d partitions) found at 0x%llx\n", res, offs);
+ else
+ pr_info("No valid 'SC PART MAP' was found\n");
+
+ return res;
+}
+
+static int scpart_parse(struct mtd_info *master,
+ const struct mtd_partition **pparts,
+ struct mtd_part_parser_data *data)
+{
+ const char *partname;
+ int n;
+ int nr_scparts;
+ int nr_parts = 0;
+ int res = 0;
+ struct sc_part_desc *scpart_map = NULL;
+ struct mtd_partition *parts = NULL;
+ struct device_node *mtd_node;
+ struct device_node *ofpart_node;
+ struct device_node *pp;
+
+ mtd_node = mtd_get_of_node(master);
+ if (!mtd_node) {
+ res = -ENOENT;
+ goto out;
+ }
+
+ ofpart_node = of_get_child_by_name(mtd_node, "partitions");
+ if (!ofpart_node) {
+ pr_info("%s: 'partitions' subnode not found on %pOF.\n",
+ master->name, mtd_node);
+ res = -ENOENT;
+ goto out;
+ }
+
+ nr_scparts = scpart_find_partmap(master, &scpart_map);
+ if (nr_scparts <= 0) {
+ pr_info("No any partitions was found in 'SC PART MAP'.\n");
+ res = -ENOENT;
+ goto free;
+ }
+
+ parts = kcalloc(of_get_child_count(ofpart_node), sizeof(*parts),
+ GFP_KERNEL);
+ if (!parts) {
+ res = -ENOMEM;
+ goto free;
+ }
+
+ for_each_child_of_node(ofpart_node, pp) {
+ u32 scpart_id;
+
+ if (of_property_read_u32(pp, "sercomm,scpart-id", &scpart_id))
+ continue;
+
+ for (n = 0 ; n < nr_scparts ; n++)
+ if ((scpart_map[n].part_id != ID_ALREADY_FOUND) &&
+ (scpart_id == scpart_map[n].part_id))
+ break;
+ if (n >= nr_scparts)
+ /* not match */
+ continue;
+
+ /* add the partition found in OF into MTD partition array */
+ parts[nr_parts].offset = scpart_map[n].part_offs;
+ parts[nr_parts].size = scpart_map[n].part_bytes;
+ parts[nr_parts].of_node = pp;
+
+ if (!of_property_read_string(pp, "label", &partname))
+ parts[nr_parts].name = partname;
+ if (of_property_read_bool(pp, "read-only"))
+ parts[nr_parts].mask_flags |= MTD_WRITEABLE;
+ if (of_property_read_bool(pp, "lock"))
+ parts[nr_parts].mask_flags |= MTD_POWERUP_LOCK;
+
+ /* mark as 'done' */
+ scpart_map[n].part_id = ID_ALREADY_FOUND;
+
+ nr_parts++;
+ }
+
+ if (nr_parts > 0) {
+ *pparts = parts;
+ res = nr_parts;
+ } else
+ pr_info("No partition in OF matches partition ID with 'SC PART MAP'.\n");
+
+ of_node_put(pp);
+
+free:
+ kfree(scpart_map);
+ if (res <= 0)
+ kfree(parts);
+
+out:
+ return res;
+}
+
+static const struct of_device_id scpart_parser_of_match_table[] = {
+ { .compatible = "sercomm,sc-partitions" },
+ {},
+};
+MODULE_DEVICE_TABLE(of, scpart_parser_of_match_table);
+
+static struct mtd_part_parser scpart_parser = {
+ .parse_fn = scpart_parse,
+ .name = "scpart",
+ .of_match_table = scpart_parser_of_match_table,
+};
+module_mtd_part_parser(scpart_parser);
+
+/* mtd parsers will request the module by parser name */
+MODULE_ALIAS("scpart");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("NOGUCHI Hiroshi <drvlabo@gmail.com>");
+MODULE_AUTHOR("Mikhail Zhilkin <csharper2005@gmail.com>");
+MODULE_DESCRIPTION("Sercomm partition parser");
@ -0,0 +1,106 @@
From ad9b10d1eaada169bd764abcab58f08538877e26 Mon Sep 17 00:00:00 2001
From: Christian Marangi <ansuelsmth@gmail.com>
Date: Wed, 22 Jun 2022 03:06:28 +0200
Subject: mtd: core: introduce of support for dynamic partitions

We have many parsers that register MTD partitions at runtime. One
example is the cmdlinepart or the smem-part parser, where the compatible
is defined in the DTS and the partitions get detected and registered by
the parser. This is problematic for the NVMEM subsystem, which requires
an OF node to detect NVMEM cells.

To fix this problem, introduce additional logic that will try to
assign an OF node to the MTD if one is declared.

On MTD addition, it will be checked whether the MTD has an OF node; if
not, we check whether a partition with the same label / node name is
declared in the DTS. If an exact match is found, the partition
dynamically allocated by the parser will have a connected OF node.

The NVMEM subsystem will detect the OF node and register any NVMEM cells
declared statically in the DTS.

Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220622010628.30414-4-ansuelsmth@gmail.com
---
drivers/mtd/mtdcore.c | 61 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 61 insertions(+)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -564,6 +564,66 @@ static int mtd_nvmem_add(struct mtd_info
return 0;
}

+static void mtd_check_of_node(struct mtd_info *mtd)
+{
+ struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
+ const char *pname, *prefix = "partition-";
+ int plen, mtd_name_len, offset, prefix_len;
+ struct mtd_info *parent;
+ bool found = false;
+
+ /* Check if MTD already has a device node */
+ if (dev_of_node(&mtd->dev))
+ return;
+
+ /* Check if a partitions node exist */
+ parent = mtd->parent;
+ parent_dn = dev_of_node(&parent->dev);
+ if (!parent_dn)
+ return;
+
+ partitions = of_get_child_by_name(parent_dn, "partitions");
+ if (!partitions)
+ goto exit_parent;
+
+ prefix_len = strlen(prefix);
+ mtd_name_len = strlen(mtd->name);
+
+ /* Search if a partition is defined with the same name */
+ for_each_child_of_node(partitions, mtd_dn) {
+ offset = 0;
+
+ /* Skip partition with no/wrong prefix */
+ if (!of_node_name_prefix(mtd_dn, "partition-"))
+ continue;
+
+ /* Label have priority. Check that first */
+ if (of_property_read_string(mtd_dn, "label", &pname)) {
+ of_property_read_string(mtd_dn, "name", &pname);
+ offset = prefix_len;
+ }
+
+ plen = strlen(pname) - offset;
+ if (plen == mtd_name_len &&
+ !strncmp(mtd->name, pname + offset, plen)) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ goto exit_partitions;
+
+ /* Set of_node only for nvmem */
+ if (of_device_is_compatible(mtd_dn, "nvmem-cells"))
+ mtd_set_of_node(mtd, mtd_dn);
+
+exit_partitions:
+ of_node_put(partitions);
+exit_parent:
+ of_node_put(parent_dn);
+}
+
/**
* add_mtd_device - register an MTD device
* @mtd: pointer to new MTD device info structure
@@ -669,6 +729,7 @@ int add_mtd_device(struct mtd_info *mtd)
mtd->dev.devt = MTD_DEVT(i);
dev_set_name(&mtd->dev, "mtd%d", i);
dev_set_drvdata(&mtd->dev, mtd);
+ mtd_check_of_node(mtd);
of_node_get(mtd_get_of_node(mtd));
error = device_register(&mtd->dev);
if (error) {
@ -0,0 +1,72 @@
From b0321721be50b80c03a51866a94fde4f94690e18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Wed, 15 Jun 2022 21:42:59 +0200
Subject: [PATCH] mtd: allow getting MTD device associated with a specific DT
node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MTD subsystem API allows interacting with MTD devices (e.g. reading,
writing, handling bad blocks). So far a random driver could get MTD
device only by its name (get_mtd_device_nm()). This change allows
getting them also by a DT node.

This API is required for drivers handling DT defined MTD partitions in a
specific way (e.g. U-Boot (sub)partition with environment variables).

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Acked-by: Miquel Raynal <miquel.raynal@bootlin.com>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
---
drivers/mtd/mtdcore.c | 28 ++++++++++++++++++++++++++++
include/linux/mtd/mtd.h | 1 +
2 files changed, 29 insertions(+)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1236,6 +1236,34 @@ int __get_mtd_device(struct mtd_info *mt
EXPORT_SYMBOL_GPL(__get_mtd_device);

/**
+ * of_get_mtd_device_by_node - obtain an MTD device associated with a given node
+ *
+ * @np: device tree node
+ */
+struct mtd_info *of_get_mtd_device_by_node(struct device_node *np)
+{
+ struct mtd_info *mtd = NULL;
+ struct mtd_info *tmp;
+ int err;
+
+ mutex_lock(&mtd_table_mutex);
+
+ err = -EPROBE_DEFER;
+ mtd_for_each_device(tmp) {
+ if (mtd_get_of_node(tmp) == np) {
+ mtd = tmp;
+ err = __get_mtd_device(mtd);
+ break;
+ }
+ }
+
+ mutex_unlock(&mtd_table_mutex);
+
+ return err ? ERR_PTR(err) : mtd;
+}
+EXPORT_SYMBOL_GPL(of_get_mtd_device_by_node);
+
+/**
* get_mtd_device_nm - obtain a validated handle for an MTD device by
* device name
* @name: MTD device name to open
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -682,6 +682,7 @@ extern int mtd_device_unregister(struct
extern struct mtd_info *get_mtd_device(struct mtd_info *mtd, int num);
extern int __get_mtd_device(struct mtd_info *mtd);
extern void __put_mtd_device(struct mtd_info *mtd);
+extern struct mtd_info *of_get_mtd_device_by_node(struct device_node *np);
extern struct mtd_info *get_mtd_device_nm(const char *name);
extern void put_mtd_device(struct mtd_info *mtd);
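
Note: a usage sketch for the new helper; the surrounding function is
hypothetical. The helper returns ERR_PTR(-EPROBE_DEFER) until a matching
MTD is registered, and the reference it takes must be dropped with
put_mtd_device():

	#include <linux/err.h>
	#include <linux/mtd/mtd.h>
	#include <linux/of.h>

	static int example_use_mtd(struct device_node *np)
	{
		struct mtd_info *mtd;

		mtd = of_get_mtd_device_by_node(np);
		if (IS_ERR(mtd))
			return PTR_ERR(mtd);	/* often -EPROBE_DEFER */

		/* ... mtd_read()/mtd_write() against the device ... */

		put_mtd_device(mtd);	/* balance the reference */
		return 0;
	}
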
@ -0,0 +1,30 @@
From 7ec4cdb321738d44ae5d405e7b6ac73dfbf99caa Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 25 Jul 2022 22:49:25 +0900
Subject: [PATCH] mtd: core: check partition before dereference

syzbot is reporting a NULL pointer dereference at mtd_check_of_node() [1],
because the mtdram test device (CONFIG_MTD_MTDRAM) is not a partition.

Link: https://syzkaller.appspot.com/bug?extid=fe013f55a2814a9e8cfd [1]
Reported-by: syzbot <syzbot+fe013f55a2814a9e8cfd@syzkaller.appspotmail.com>
Reported-by: kernel test robot <oliver.sang@intel.com>
Fixes: ad9b10d1eaada169 ("mtd: core: introduce of support for dynamic partitions")
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
CC: stable@vger.kernel.org
Signed-off-by: Richard Weinberger <richard@nod.at>
---
drivers/mtd/mtdcore.c | 2 ++
1 file changed, 2 insertions(+)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -577,6 +577,8 @@ static void mtd_check_of_node(struct mtd
return;

/* Check if a partitions node exist */
+ if (!mtd_is_partition(mtd))
+ return;
parent = mtd->parent;
parent_dn = dev_of_node(&parent->dev);
if (!parent_dn)
@ -0,0 +1,101 @@
From 12b58961de0bd88b3c7dfa5d21f6d67f4678b780 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 18 Oct 2022 07:18:22 +0200
Subject: [PATCH] mtd: core: add missing of_node_get() in dynamic partitions
code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes unbalanced of_node_put():
[ 1.078910] 6 cmdlinepart partitions found on MTD device gpmi-nand
[ 1.085116] Creating 6 MTD partitions on "gpmi-nand":
[ 1.090181] 0x000000000000-0x000008000000 : "nandboot"
[ 1.096952] 0x000008000000-0x000009000000 : "nandfit"
[ 1.103547] 0x000009000000-0x00000b000000 : "nandkernel"
[ 1.110317] 0x00000b000000-0x00000c000000 : "nanddtb"
[ 1.115525] ------------[ cut here ]------------
[ 1.120141] refcount_t: addition on 0; use-after-free.
[ 1.125328] WARNING: CPU: 0 PID: 1 at lib/refcount.c:25 refcount_warn_saturate+0xdc/0x148
[ 1.133528] Modules linked in:
[ 1.136589] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.0.0-rc7-next-20220930-04543-g8cf3f7
[ 1.146342] Hardware name: Freescale i.MX8DXL DDR3L EVK (DT)
[ 1.151999] pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
[ 1.158965] pc : refcount_warn_saturate+0xdc/0x148
[ 1.163760] lr : refcount_warn_saturate+0xdc/0x148
[ 1.168556] sp : ffff800009ddb080
[ 1.171866] x29: ffff800009ddb080 x28: ffff800009ddb35a x27: 0000000000000002
[ 1.179015] x26: ffff8000098b06ad x25: ffffffffffffffff x24: ffff0a00ffffff05
[ 1.186165] x23: ffff00001fdf6470 x22: ffff800009ddb367 x21: 0000000000000000
[ 1.193314] x20: ffff00001fdfebe8 x19: ffff00001fdfec50 x18: ffffffffffffffff
[ 1.200464] x17: 0000000000000000 x16: 0000000000000118 x15: 0000000000000004
[ 1.207614] x14: 0000000000000fff x13: ffff800009bca248 x12: 0000000000000003
[ 1.214764] x11: 00000000ffffefff x10: c0000000ffffefff x9 : 4762cb2ccb52de00
[ 1.221914] x8 : 4762cb2ccb52de00 x7 : 205d313431303231 x6 : 312e31202020205b
[ 1.229063] x5 : ffff800009d55c1f x4 : 0000000000000001 x3 : 0000000000000000
[ 1.236213] x2 : 0000000000000000 x1 : ffff800009954be6 x0 : 000000000000002a
[ 1.243365] Call trace:
[ 1.245806] refcount_warn_saturate+0xdc/0x148
[ 1.250253] kobject_get+0x98/0x9c
[ 1.253658] of_node_get+0x20/0x34
[ 1.257072] of_fwnode_get+0x3c/0x54
[ 1.260652] fwnode_get_nth_parent+0xd8/0xf4
[ 1.264926] fwnode_full_name_string+0x3c/0xb4
[ 1.269373] device_node_string+0x498/0x5b4
[ 1.273561] pointer+0x41c/0x5d0
[ 1.276793] vsnprintf+0x4d8/0x694
[ 1.280198] vprintk_store+0x164/0x528
[ 1.283951] vprintk_emit+0x98/0x164
[ 1.287530] vprintk_default+0x44/0x6c
[ 1.291284] vprintk+0xf0/0x134
[ 1.294428] _printk+0x54/0x7c
[ 1.297486] of_node_release+0xe8/0x128
[ 1.301326] kobject_put+0x98/0xfc
[ 1.304732] of_node_put+0x1c/0x28
[ 1.308137] add_mtd_device+0x484/0x6d4
[ 1.311977] add_mtd_partitions+0xf0/0x1d0
[ 1.316078] parse_mtd_partitions+0x45c/0x518
[ 1.320439] mtd_device_parse_register+0xb0/0x274
[ 1.325147] gpmi_nand_probe+0x51c/0x650
[ 1.329074] platform_probe+0xa8/0xd0
[ 1.332740] really_probe+0x130/0x334
[ 1.336406] __driver_probe_device+0xb4/0xe0
[ 1.340681] driver_probe_device+0x3c/0x1f8
[ 1.344869] __driver_attach+0xdc/0x1a4
[ 1.348708] bus_for_each_dev+0x80/0xcc
[ 1.352548] driver_attach+0x24/0x30
[ 1.356127] bus_add_driver+0x108/0x1f4
[ 1.359967] driver_register+0x78/0x114
[ 1.363807] __platform_driver_register+0x24/0x30
[ 1.368515] gpmi_nand_driver_init+0x1c/0x28
[ 1.372798] do_one_initcall+0xbc/0x238
[ 1.376638] do_initcall_level+0x94/0xb4
[ 1.380565] do_initcalls+0x54/0x94
[ 1.384058] do_basic_setup+0x1c/0x28
[ 1.387724] kernel_init_freeable+0x110/0x188
[ 1.392084] kernel_init+0x20/0x1a0
[ 1.395578] ret_from_fork+0x10/0x20
[ 1.399157] ---[ end trace 0000000000000000 ]---
[ 1.403782] ------------[ cut here ]------------

Reported-by: Han Xu <han.xu@nxp.com>
Fixes: ad9b10d1eaada169 ("mtd: core: introduce of support for dynamic partitions")
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Tested-by: Han Xu <han.xu@nxp.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221018051822.28685-1-zajec5@gmail.com
---
drivers/mtd/mtdcore.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -580,7 +580,7 @@ static void mtd_check_of_node(struct mtd
if (!mtd_is_partition(mtd))
return;
parent = mtd->parent;
- parent_dn = dev_of_node(&parent->dev);
+ parent_dn = of_node_get(dev_of_node(&parent->dev));
if (!parent_dn)
return;
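
Note: the rule behind this one-liner, as a short sketch (the function
name is made up): dev_of_node() returns a borrowed pointer, so a code
path that ends in of_node_put() must first take its own reference with
of_node_get():

	#include <linux/device.h>
	#include <linux/of.h>

	static void example_take_and_release(struct device *dev)
	{
		/* Borrowed: no reference taken, no put allowed. */
		struct device_node *borrowed = dev_of_node(dev);

		/* Owned: reference taken, must be released. */
		struct device_node *owned = of_node_get(borrowed);

		of_node_put(owned);
	}
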
@ -0,0 +1,65 @@
From 63db0cb35e1cb3b3c134906d1062f65513fdda2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 4 Oct 2022 10:37:09 +0200
Subject: [PATCH] mtd: core: simplify (a bit) code find partition-matching
dynamic OF node
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Don't hardcode "partition-" string twice
2. Use simpler logic & use ->name to avoid of_property_read_string()
3. Use mtd_get_of_node() helper

Cc: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221004083710.27704-1-zajec5@gmail.com
---
drivers/mtd/mtdcore.c | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -569,18 +569,16 @@ static void mtd_check_of_node(struct mtd
struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
const char *pname, *prefix = "partition-";
int plen, mtd_name_len, offset, prefix_len;
- struct mtd_info *parent;
bool found = false;

/* Check if MTD already has a device node */
- if (dev_of_node(&mtd->dev))
+ if (mtd_get_of_node(mtd))
return;

/* Check if a partitions node exist */
if (!mtd_is_partition(mtd))
return;
- parent = mtd->parent;
- parent_dn = of_node_get(dev_of_node(&parent->dev));
+ parent_dn = of_node_get(mtd_get_of_node(mtd->parent));
if (!parent_dn)
return;

@@ -593,15 +591,15 @@ static void mtd_check_of_node(struct mtd

/* Search if a partition is defined with the same name */
for_each_child_of_node(partitions, mtd_dn) {
- offset = 0;
-
/* Skip partition with no/wrong prefix */
- if (!of_node_name_prefix(mtd_dn, "partition-"))
+ if (!of_node_name_prefix(mtd_dn, prefix))
continue;

/* Label have priority. Check that first */
- if (of_property_read_string(mtd_dn, "label", &pname)) {
- of_property_read_string(mtd_dn, "name", &pname);
+ if (!of_property_read_string(mtd_dn, "label", &pname)) {
+ offset = 0;
+ } else {
+ pname = mtd_dn->name;
offset = prefix_len;
}
@ -0,0 +1,84 @@
From ddb8cefb7af288950447ca6eeeafb09977dab56f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Tue, 4 Oct 2022 10:37:10 +0200
Subject: [PATCH] mtd: core: try to find OF node for every MTD partition
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

So far this feature was limited to the top-level "nvmem-cells" node.
There are multiple parsers creating partitions and subpartitions
dynamically. Extend that code to handle them too.

This allows finding partition-* node for every MTD (sub)partition.

Random example:

partitions {
	compatible = "brcm,bcm947xx-cfe-partitions";

	partition-firmware {
		compatible = "brcm,trx";

		partition-loader {
		};
	};
};

Cc: Christian Marangi <ansuelsmth@gmail.com>
Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221004083710.27704-2-zajec5@gmail.com
---
drivers/mtd/mtdcore.c | 18 ++++++------------
1 file changed, 6 insertions(+), 12 deletions(-)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -569,20 +569,22 @@ static void mtd_check_of_node(struct mtd
struct device_node *partitions, *parent_dn, *mtd_dn = NULL;
const char *pname, *prefix = "partition-";
int plen, mtd_name_len, offset, prefix_len;
- bool found = false;

/* Check if MTD already has a device node */
if (mtd_get_of_node(mtd))
return;

- /* Check if a partitions node exist */
if (!mtd_is_partition(mtd))
return;
+
parent_dn = of_node_get(mtd_get_of_node(mtd->parent));
if (!parent_dn)
return;

- partitions = of_get_child_by_name(parent_dn, "partitions");
+ if (mtd_is_partition(mtd->parent))
+ partitions = of_node_get(parent_dn);
+ else
+ partitions = of_get_child_by_name(parent_dn, "partitions");
if (!partitions)
goto exit_parent;

@@ -606,19 +608,11 @@ static void mtd_check_of_node(struct mtd
plen = strlen(pname) - offset;
if (plen == mtd_name_len &&
!strncmp(mtd->name, pname + offset, plen)) {
- found = true;
+ mtd_set_of_node(mtd, mtd_dn);
break;
}
}

- if (!found)
- goto exit_partitions;
-
- /* Set of_node only for nvmem */
- if (of_device_is_compatible(mtd_dn, "nvmem-cells"))
- mtd_set_of_node(mtd, mtd_dn);
-
-exit_partitions:
of_node_put(partitions);
exit_parent:
of_node_put(parent_dn);
@ -0,0 +1,32 @@
From 26bccc9671ba5e01f7153addbe94e7dc3f677375 Mon Sep 17 00:00:00 2001
From: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
Date: Mon, 3 Jan 2022 03:03:16 +0000
Subject: [PATCH 13/14] mtd: parsers: qcom: Don't print error message on
-EPROBE_DEFER

It's possible for the main smem driver not to be loaded by the time we
come along to parse the smem partition description, but this is
perfectly normal.

No need to print out an error message in this case.

Signed-off-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220103030316.58301-3-bryan.odonoghue@linaro.org
---
drivers/mtd/parsers/qcomsmempart.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

--- a/drivers/mtd/parsers/qcomsmempart.c
+++ b/drivers/mtd/parsers/qcomsmempart.c
@@ -75,7 +75,8 @@ static int parse_qcomsmem_part(struct mt
pr_debug("Parsing partition table info from SMEM\n");
ptable = qcom_smem_get(SMEM_APPS, SMEM_AARM_PARTITION_TABLE, &len);
if (IS_ERR(ptable)) {
- pr_err("Error reading partition table header\n");
+ if (PTR_ERR(ptable) != -EPROBE_DEFER)
+ pr_err("Error reading partition table header\n");
return PTR_ERR(ptable);
}
@ -0,0 +1,47 @@
From 26422ac78e9d8767bd4aabfbae616b15edbf6a1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rafa=C5=82=20Mi=C5=82ecki?= <rafal@milecki.pl>
Date: Sat, 22 Oct 2022 23:13:18 +0200
Subject: [PATCH] mtd: core: set ROOT_DEV for partitions marked as rootfs in DT
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds support for the "linux,rootfs" binding that is used to mark
the flash partition containing the rootfs. It's useful for devices using
device tree that don't have a bootloader passing root info in the
cmdline.

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221022211318.32009-2-zajec5@gmail.com
---
drivers/mtd/mtdcore.c | 12 ++++++++++++
1 file changed, 12 insertions(+)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -28,6 +28,7 @@
#include <linux/leds.h>
#include <linux/debugfs.h>
#include <linux/nvmem-provider.h>
+#include <linux/root_dev.h>

#include <linux/mtd/mtd.h>
#include <linux/mtd/partitions.h>
@@ -748,6 +749,17 @@ int add_mtd_device(struct mtd_info *mtd)
not->add(mtd);

mutex_unlock(&mtd_table_mutex);
+
+ if (of_find_property(mtd_get_of_node(mtd), "linux,rootfs", NULL)) {
+ if (IS_BUILTIN(CONFIG_MTD)) {
+ pr_info("mtd: setting mtd%d (%s) as root device\n", mtd->index, mtd->name);
+ ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, mtd->index);
+ } else {
+ pr_warn("mtd: can't set mtd%d (%s) as root device - mtd must be builtin\n",
+ mtd->index, mtd->name);
+ }
+ }
+
/* We _know_ we aren't being removed, because
our caller is still holding us here. So none
of this try_ nonsense, and no bitching about it
@ -0,0 +1,33 @@
From 2365f91c861cbfeef7141c69842848c7b2d3c2db Mon Sep 17 00:00:00 2001
From: INAGAKI Hiroshi <musashino.open@gmail.com>
Date: Sun, 13 Feb 2022 15:40:44 +0900
Subject: [PATCH] mtd: parsers: trx: allow to use on MediaTek MIPS SoCs

Buffalo sells some router devices which have trx-formatted firmware,
based on MediaTek MIPS SoCs. To use parser_trx on those devices, add
"RALINK" to the dependency and allow it to be compiled for MediaTek
MIPS SoCs.

examples:

- WCR-1166DS (MT7628)
- WSR-1166DHP (MT7621)
- WSR-2533DHP (MT7621)

Signed-off-by: INAGAKI Hiroshi <musashino.open@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220213064045.1781-1-musashino.open@gmail.com
---
drivers/mtd/parsers/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

--- a/drivers/mtd/parsers/Kconfig
+++ b/drivers/mtd/parsers/Kconfig
@@ -115,7 +115,7 @@ config MTD_AFS_PARTS

config MTD_PARSER_TRX
tristate "Parser for TRX format partitions"
- depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || COMPILE_TEST)
+ depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || RALINK || COMPILE_TEST)
help
TRX is a firmware format used by Broadcom on their devices. It
may contain up to 3/4 partitions (depending on the version).
@ -0,0 +1,58 @@
From 573eec222bc82fb5e724586267fbbb1aed9ffd03 Mon Sep 17 00:00:00 2001
From: Chuanhong Guo <gch981213@gmail.com>
Date: Sun, 20 Mar 2022 17:59:58 +0800
Subject: [PATCH 2/5] mtd: spinand: gigadevice: add support for GD5FxGQ4xExxG

Add support for:
GD5F1GQ4RExxG
GD5F2GQ4{U,R}ExxG

These chips differ from GD5F1GQ4UExxG only in chip ID, voltage
and capacity.

Signed-off-by: Chuanhong Guo <gch981213@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220320100001.247905-3-gch981213@gmail.com
---
drivers/mtd/nand/spi/gigadevice.c | 30 ++++++++++++++++++++++++++++++
1 file changed, 30 insertions(+)

--- a/drivers/mtd/nand/spi/gigadevice.c
+++ b/drivers/mtd/nand/spi/gigadevice.c
@@ -333,6 +333,36 @@ static const struct spinand_info gigadev
SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
gd5fxgq4uexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F1GQ4RExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xc1),
+ NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq4uexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F2GQ4UExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xd2),
+ NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq4uexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F2GQ4RExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_ADDR, 0xc2),
+ NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
+ NAND_ECCREQ(8, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq4uexxg_ecc_get_status)),
SPINAND_INFO("GD5F1GQ4UFxxG",
SPINAND_ID(SPINAND_READID_METHOD_OPCODE, 0xb1, 0x48),
NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
@ -0,0 +1,33 @@
From 620a988813403318023296b61228ee8f3fcdb8e0 Mon Sep 17 00:00:00 2001
From: Chuanhong Guo <gch981213@gmail.com>
Date: Sun, 20 Mar 2022 17:59:59 +0800
Subject: [PATCH 3/5] mtd: spinand: gigadevice: add support for GD5F1GQ5RExxG

This chip is the 1.8V version of the GD5F1GQ5UExxG.

Signed-off-by: Chuanhong Guo <gch981213@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220320100001.247905-4-gch981213@gmail.com
---
drivers/mtd/nand/spi/gigadevice.c | 10 ++++++++++
1 file changed, 10 insertions(+)

--- a/drivers/mtd/nand/spi/gigadevice.c
+++ b/drivers/mtd/nand/spi/gigadevice.c
@@ -383,6 +383,16 @@ static const struct spinand_info gigadev
SPINAND_HAS_QE_BIT,
SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
gd5fxgq5xexxg_ecc_get_status)),
+ SPINAND_INFO("GD5F1GQ5RExxG",
+ SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x41),
+ NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
+ NAND_ECCREQ(4, 512),
+ SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+ &write_cache_variants,
+ &update_cache_variants),
+ SPINAND_HAS_QE_BIT,
+ SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+ gd5fxgq5xexxg_ecc_get_status)),
};

static const struct spinand_manufacturer_ops gigadevice_spinand_manuf_ops = {
@@ -0,0 +1,84 @@
From 194ec04b3a9e7fa97d1fbef296410631bc3cf1c8 Mon Sep 17 00:00:00 2001
From: Chuanhong Guo <gch981213@gmail.com>
Date: Sun, 20 Mar 2022 18:00:00 +0800
Subject: [PATCH 4/5] mtd: spinand: gigadevice: add support for GD5F{2,
 4}GQ5xExxG

Add support for:
GD5F2GQ5{U,R}ExxG
GD5F4GQ6{U,R}ExxG

These chips use 4 dummy bytes for quad I/O and 2 dummy bytes for dual I/O.
Apart from that and the memory layout, they are identical to their 1G variants.

Signed-off-by: Chuanhong Guo <gch981213@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220320100001.247905-5-gch981213@gmail.com
---
 drivers/mtd/nand/spi/gigadevice.c | 48 +++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

--- a/drivers/mtd/nand/spi/gigadevice.c
+++ b/drivers/mtd/nand/spi/gigadevice.c
@@ -47,6 +47,14 @@ static SPINAND_OP_VARIANTS(read_cache_va
 		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
 		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
 
+static SPINAND_OP_VARIANTS(read_cache_variants_2gq5,
+		SPINAND_PAGE_READ_FROM_CACHE_QUADIO_OP(0, 4, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X4_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_DUALIO_OP(0, 2, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_X2_OP(0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(true, 0, 1, NULL, 0),
+		SPINAND_PAGE_READ_FROM_CACHE_OP(false, 0, 1, NULL, 0));
+
 static SPINAND_OP_VARIANTS(write_cache_variants,
 		SPINAND_PROG_LOAD_X4(true, 0, NULL, 0),
 		SPINAND_PROG_LOAD(true, 0, NULL, 0));
@@ -391,6 +399,46 @@ static const struct spinand_info gigadev
 					      &write_cache_variants,
 					      &update_cache_variants),
 		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq5xexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F2GQ5UExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x52),
+		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
+		     NAND_ECCREQ(4, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq5xexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F2GQ5RExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x42),
+		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
+		     NAND_ECCREQ(4, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq5xexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F4GQ6UExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x55),
+		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 2, 1),
+		     NAND_ECCREQ(4, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq5xexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F4GQ6RExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x45),
+		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 2, 1),
+		     NAND_ECCREQ(4, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_2gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
 		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
 				     gd5fxgq5xexxg_ecc_get_status)),
 };
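
[Editor's note: the NAND_MEMORG() arguments above are positional. A minimal annotated sketch, based on the field order of NAND_MEMORG() in include/linux/mtd/nand.h and the GD5F4GQ6UExxG line above; the comments are ours, not part of the patch:]

/*
 * NAND_MEMORG(1,    -- bits per cell (SLC)
 *             2048, -- page size in bytes
 *             128,  -- OOB bytes per page
 *             64,   -- pages per eraseblock
 *             2048, -- eraseblocks per LUN
 *             40,   -- max bad eraseblocks per LUN
 *             1,    -- planes per LUN
 *             2,    -- LUNs per target: this is what doubles the
 *                      capacity relative to the 2 Gbit parts
 *             1)    -- number of targets
 */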
@@ -0,0 +1,91 @@
From 54647cd003c08b714474a5b599a147ec6a160486 Mon Sep 17 00:00:00 2001
From: Chuanhong Guo <gch981213@gmail.com>
Date: Sun, 20 Mar 2022 18:00:01 +0800
Subject: [PATCH 5/5] mtd: spinand: gigadevice: add support for GD5FxGM7xExxG

Add support for:
GD5F{1,2}GM7{U,R}ExxG
GD5F4GM8{U,R}ExxG

These are new 27nm counterparts for the GD5FxGQ4 chips from GigaDevice
with 8b/512b on-die ECC capability.
These chips (and the currently supported GD5FxGQ5 chips) have a QIO DTR
instruction for reading the page cache. It isn't added in this patch
because I don't have a DTR-capable SPI controller for testing.

Signed-off-by: Chuanhong Guo <gch981213@gmail.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220320100001.247905-6-gch981213@gmail.com
---
 drivers/mtd/nand/spi/gigadevice.c | 60 +++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

--- a/drivers/mtd/nand/spi/gigadevice.c
+++ b/drivers/mtd/nand/spi/gigadevice.c
@@ -441,6 +441,66 @@ static const struct spinand_info gigadev
 		     SPINAND_HAS_QE_BIT,
 		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
 				     gd5fxgq5xexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F1GM7UExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x91),
+		     NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq4uexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F1GM7RExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x81),
+		     NAND_MEMORG(1, 2048, 128, 64, 1024, 20, 1, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq4uexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F2GM7UExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x92),
+		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq4uexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F2GM7RExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x82),
+		     NAND_MEMORG(1, 2048, 128, 64, 2048, 40, 1, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq4uexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F4GM8UExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x95),
+		     NAND_MEMORG(1, 2048, 128, 64, 4096, 80, 1, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq4uexxg_ecc_get_status)),
+	SPINAND_INFO("GD5F4GM8RExxG",
+		     SPINAND_ID(SPINAND_READID_METHOD_OPCODE_DUMMY, 0x85),
+		     NAND_MEMORG(1, 2048, 128, 64, 4096, 80, 1, 1, 1),
+		     NAND_ECCREQ(8, 512),
+		     SPINAND_INFO_OP_VARIANTS(&read_cache_variants_1gq5,
+					      &write_cache_variants,
+					      &update_cache_variants),
+		     SPINAND_HAS_QE_BIT,
+		     SPINAND_ECCINFO(&gd5fxgqx_variant2_ooblayout,
+				     gd5fxgq4uexxg_ecc_get_status)),
 };
 
 static const struct spinand_manufacturer_ops gigadevice_spinand_manuf_ops = {
@@ -0,0 +1,229 @@
From aec4d5f5ffd0f0092bd9dc21ea90e0bc237d4b74 Mon Sep 17 00:00:00 2001
From: Rafał Miłecki <rafal@milecki.pl>
Date: Sat, 15 Oct 2022 11:29:50 +0200
Subject: [PATCH] mtd: parsers: add TP-Link SafeLoader partitions table parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This parser deals with most TP-Link home routers. It reads info about
partitions and registers them in the MTD subsystem.

Example from TP-Link Archer C5 V2:

spi-nor spi0.0: s25fl128s1 (16384 Kbytes)
15 tplink-safeloader partitions found on MTD device spi0.0
Creating 15 MTD partitions on "spi0.0":
0x000000000000-0x000000040000 : "fs-uboot"
0x000000040000-0x000000440000 : "os-image"
0x000000440000-0x000000e40000 : "rootfs"
0x000000e40000-0x000000e40200 : "default-mac"
0x000000e40200-0x000000e40400 : "pin"
0x000000e40400-0x000000e40600 : "product-info"
0x000000e50000-0x000000e60000 : "partition-table"
0x000000e60000-0x000000e60200 : "soft-version"
0x000000e61000-0x000000e70000 : "support-list"
0x000000e70000-0x000000e80000 : "profile"
0x000000e80000-0x000000e90000 : "default-config"
0x000000e90000-0x000000ee0000 : "user-config"
0x000000ee0000-0x000000fe0000 : "log"
0x000000fe0000-0x000000ff0000 : "radio_bk"
0x000000ff0000-0x000001000000 : "radio"

Signed-off-by: Rafał Miłecki <rafal@milecki.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20221015092950.27467-2-zajec5@gmail.com
---
 drivers/mtd/parsers/Kconfig             |  15 +++
 drivers/mtd/parsers/Makefile            |   1 +
 drivers/mtd/parsers/tplink_safeloader.c | 150 ++++++++++++++++++++++++
 3 files changed, 166 insertions(+)
 create mode 100644 drivers/mtd/parsers/tplink_safeloader.c

--- a/drivers/mtd/parsers/Kconfig
+++ b/drivers/mtd/parsers/Kconfig
@@ -113,6 +113,21 @@ config MTD_AFS_PARTS
 	  for your particular device. It won't happen automatically. The
 	  'physmap' map driver (CONFIG_MTD_PHYSMAP) does this, for example.
 
+config MTD_PARSER_TPLINK_SAFELOADER
+	tristate "TP-Link Safeloader partitions parser"
+	depends on MTD && (ARCH_BCM_5301X || ATH79 || SOC_MT7620 || SOC_MT7621 || COMPILE_TEST)
+	help
+	  TP-Link home routers use flash partitions to store various data. Info
+	  about flash space layout is stored in a partitions table using a
+	  custom ASCII-based format.
+
+	  That format was first found in devices with SafeLoader bootloader and
+	  was named after it. Later it was adapted to CFE and U-Boot
+	  bootloaders.
+
+	  This driver reads partitions table, parses it and creates MTD
+	  partitions.
+
 config MTD_PARSER_TRX
 	tristate "Parser for TRX format partitions"
 	depends on MTD && (BCM47XX || ARCH_BCM_5301X || ARCH_MEDIATEK || RALINK || COMPILE_TEST)
--- a/drivers/mtd/parsers/Makefile
+++ b/drivers/mtd/parsers/Makefile
@@ -9,6 +9,7 @@ ofpart-$(CONFIG_MTD_OF_PARTS_BCM4908) +=
 ofpart-$(CONFIG_MTD_OF_PARTS_LINKSYS_NS)+= ofpart_linksys_ns.o
 obj-$(CONFIG_MTD_PARSER_IMAGETAG)	+= parser_imagetag.o
 obj-$(CONFIG_MTD_AFS_PARTS)		+= afs.o
+obj-$(CONFIG_MTD_PARSER_TPLINK_SAFELOADER) += tplink_safeloader.o
 obj-$(CONFIG_MTD_PARSER_TRX)		+= parser_trx.o
 obj-$(CONFIG_MTD_SERCOMM_PARTS)		+= scpart.o
 obj-$(CONFIG_MTD_SHARPSL_PARTS)		+= sharpslpart.o
--- /dev/null
+++ b/drivers/mtd/parsers/tplink_safeloader.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright © 2022 Rafał Miłecki <rafal@milecki.pl>
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+
+#define TPLINK_SAFELOADER_DATA_OFFSET		4
+#define TPLINK_SAFELOADER_MAX_PARTS		32
+
+struct safeloader_cmn_header {
+	__be32 size;
+	uint32_t unused;
+} __packed;
+
+static void *mtd_parser_tplink_safeloader_read_table(struct mtd_info *mtd)
+{
+	struct safeloader_cmn_header hdr;
+	struct device_node *np;
+	size_t bytes_read;
+	size_t offset;
+	size_t size;
+	char *buf;
+	int err;
+
+	np = mtd_get_of_node(mtd);
+	if (mtd_is_partition(mtd))
+		of_node_get(np);
+	else
+		np = of_get_child_by_name(np, "partitions");
+
+	if (of_property_read_u32(np, "partitions-table-offset", (u32 *)&offset)) {
+		pr_err("Failed to get partitions table offset\n");
+		goto err_put;
+	}
+
+	err = mtd_read(mtd, offset, sizeof(hdr), &bytes_read, (uint8_t *)&hdr);
+	if (err && !mtd_is_bitflip(err)) {
+		pr_err("Failed to read from %s at 0x%zx\n", mtd->name, offset);
+		goto err_put;
+	}
+
+	size = be32_to_cpu(hdr.size);
+
+	buf = kmalloc(size + 1, GFP_KERNEL);
+	if (!buf)
+		goto err_put;
+
+	err = mtd_read(mtd, offset + sizeof(hdr), size, &bytes_read, buf);
+	if (err && !mtd_is_bitflip(err)) {
+		pr_err("Failed to read from %s at 0x%zx\n", mtd->name, offset + sizeof(hdr));
+		goto err_kfree;
+	}
+
+	buf[size] = '\0';
+
+	of_node_put(np);
+
+	return buf;
+
+err_kfree:
+	kfree(buf);
+err_put:
+	of_node_put(np);
+	return NULL;
+}
+
+static int mtd_parser_tplink_safeloader_parse(struct mtd_info *mtd,
+					      const struct mtd_partition **pparts,
+					      struct mtd_part_parser_data *data)
+{
+	struct mtd_partition *parts;
+	char name[65];
+	size_t offset;
+	size_t bytes;
+	char *buf;
+	int idx;
+	int err;
+
+	parts = kcalloc(TPLINK_SAFELOADER_MAX_PARTS, sizeof(*parts), GFP_KERNEL);
+	if (!parts) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+
+	buf = mtd_parser_tplink_safeloader_read_table(mtd);
+	if (!buf) {
+		err = -ENOENT;
+		goto err_out;
+	}
+
+	for (idx = 0, offset = TPLINK_SAFELOADER_DATA_OFFSET;
+	     idx < TPLINK_SAFELOADER_MAX_PARTS &&
+	     sscanf(buf + offset, "partition %64s base 0x%llx size 0x%llx%zn\n",
+		    name, &parts[idx].offset, &parts[idx].size, &bytes) == 3;
+	     idx++, offset += bytes + 1) {
+		parts[idx].name = kstrdup(name, GFP_KERNEL);
+		if (!parts[idx].name) {
+			err = -ENOMEM;
+			goto err_free;
+		}
+	}
+
+	if (idx == TPLINK_SAFELOADER_MAX_PARTS)
+		pr_warn("Reached maximum number of partitions!\n");
+
+	kfree(buf);
+
+	*pparts = parts;
+
+	return idx;
+
+err_free:
+	for (idx -= 1; idx >= 0; idx--)
+		kfree(parts[idx].name);
+err_out:
+	return err;
+};
+
+static void mtd_parser_tplink_safeloader_cleanup(const struct mtd_partition *pparts,
+						 int nr_parts)
+{
+	int i;
+
+	for (i = 0; i < nr_parts; i++)
+		kfree(pparts[i].name);
+
+	kfree(pparts);
+}
+
+static const struct of_device_id mtd_parser_tplink_safeloader_of_match_table[] = {
+	{ .compatible = "tplink,safeloader-partitions" },
+	{},
+};
+MODULE_DEVICE_TABLE(of, mtd_parser_tplink_safeloader_of_match_table);
+
+static struct mtd_part_parser mtd_parser_tplink_safeloader = {
+	.parse_fn = mtd_parser_tplink_safeloader_parse,
+	.cleanup = mtd_parser_tplink_safeloader_cleanup,
+	.name = "tplink-safeloader",
+	.of_match_table = mtd_parser_tplink_safeloader_of_match_table,
+};
+module_mtd_part_parser(mtd_parser_tplink_safeloader);
+
+MODULE_LICENSE("GPL");
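
[Editor's note: for illustration, the sscanf() format in the parser above implies that, past the 4-byte TPLINK_SAFELOADER_DATA_OFFSET, the on-flash table is plain ASCII with one newline-terminated entry per partition. A hypothetical table matching the Archer C5 V2 log in the commit message would start like this (a sketch, not a dump from a real device):]

partition fs-uboot base 0x00000 size 0x40000
partition os-image base 0x40000 size 0x400000
partition rootfs base 0x440000 size 0xa00000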
@@ -0,0 +1,49 @@
From 6abef37d16d0c570ef5a149e63762fba2a30804b Mon Sep 17 00:00:00 2001
From: "Leon M. George" <leon@georgemail.eu>
Date: Wed, 30 Mar 2022 16:16:56 +0200
Subject: [PATCH] mtd: spi-nor: support eon en25qh256a variant

The EN25QH256A variant of the EN25QH256 doesn't initialize correctly from SFDP
alone and only accesses memory below 8 MiB (addr_width is 4 but read_opcode
takes only 3 bytes).

Set SNOR_F_4B_OPCODES if the flash chip variant was detected using hwcaps.

The fix submitted upstream uses the PARSE_SFDP initializer that is not
available in the kernel used with OpenWrt.

Signed-off-by: Leon M. George <leon@georgemail.eu>
---
 drivers/mtd/spi-nor/eon.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

--- a/drivers/mtd/spi-nor/eon.c
+++ b/drivers/mtd/spi-nor/eon.c
@@ -8,6 +8,16 @@
 
 #include "core.h"
 
+static void en25qh256_post_sfdp_fixups(struct spi_nor *nor)
+{
+	if (nor->params->hwcaps.mask & SNOR_HWCAPS_READ_1_1_4)
+		nor->flags |= SNOR_F_4B_OPCODES;
+}
+
+static const struct spi_nor_fixups en25qh256_fixups = {
+	.post_sfdp = en25qh256_post_sfdp_fixups,
+};
+
 static const struct flash_info eon_parts[] = {
 	/* EON -- en25xxx */
 	{ "en25f32", INFO(0x1c3116, 0, 64 * 1024, 64, SECT_4K) },
@@ -23,7 +33,9 @@ static const struct flash_info eon_parts
 	{ "en25qh64", INFO(0x1c7017, 0, 64 * 1024, 128,
 			   SECT_4K | SPI_NOR_DUAL_READ) },
 	{ "en25qh128", INFO(0x1c7018, 0, 64 * 1024, 256, 0) },
-	{ "en25qh256", INFO(0x1c7019, 0, 64 * 1024, 512, 0) },
+	{ "en25qh256", INFO(0x1c7019, 0, 64 * 1024, 512,
+			    SPI_NOR_DUAL_READ)
+		.fixups = &en25qh256_fixups },
 	{ "en25s64", INFO(0x1c3817, 0, 64 * 1024, 128, SECT_4K) },
 };
 
@@ -0,0 +1,73 @@
From e237285113963bd1dd2e925770aa8b3aa8a1894c Mon Sep 17 00:00:00 2001
From: Michał Kępień <kernel@kempniu.pl>
Date: Wed, 29 Jun 2022 14:57:34 +0200
Subject: [PATCH 1/4] mtd: track maximum number of bitflips for each read
 request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

mtd_read_oob() callers are currently oblivious to the details of ECC
errors detected during the read operation - they only learn (through the
return value) whether any corrected bitflips or uncorrectable errors
occurred. More detailed ECC information can be useful to user-space
applications for making better-informed choices about moving data
around.

Extend struct mtd_oob_ops with a pointer to a newly-introduced struct
mtd_req_stats and set its 'max_bitflips' field to the maximum number of
bitflips found in a single ECC step during the read operation performed
by mtd_read_oob(). This is a prerequisite for ultimately passing that
value back to user space.

Suggested-by: Boris Brezillon <boris.brezillon@collabora.com>
Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-2-kernel@kempniu.pl
---
 drivers/mtd/mtdcore.c   | 5 +++++
 include/linux/mtd/mtd.h | 5 +++++
 2 files changed, 10 insertions(+)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -1676,6 +1676,9 @@ int mtd_read_oob(struct mtd_info *mtd, l
 	if (!master->_read_oob && (!master->_read || ops->oobbuf))
 		return -EOPNOTSUPP;
 
+	if (ops->stats)
+		memset(ops->stats, 0, sizeof(*ops->stats));
+
 	if (mtd->flags & MTD_SLC_ON_MLC_EMULATION)
 		ret_code = mtd_io_emulated_slc(mtd, from, true, ops);
 	else
@@ -1693,6 +1696,8 @@ int mtd_read_oob(struct mtd_info *mtd, l
 		return ret_code;
 	if (mtd->ecc_strength == 0)
 		return 0;	/* device lacks ecc */
+	if (ops->stats)
+		ops->stats->max_bitflips = ret_code;
 	return ret_code >= mtd->bitflip_threshold ? -EUCLEAN : 0;
 }
 EXPORT_SYMBOL_GPL(mtd_read_oob);
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -40,6 +40,10 @@ struct mtd_erase_region_info {
 	unsigned long *lockmap;		/* If keeping bitmap of locks */
 };
 
+struct mtd_req_stats {
+	unsigned int max_bitflips;
+};
+
 /**
  * struct mtd_oob_ops - oob operation operands
  * @mode:	operation mode
@@ -70,6 +74,7 @@ struct mtd_oob_ops {
 	uint32_t	ooboffs;
 	uint8_t		*datbuf;
 	uint8_t		*oobbuf;
+	struct mtd_req_stats *stats;
 };
 
 #define MTD_MAX_OOBFREE_ENTRIES_LARGE	32
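
[Editor's note: a minimal sketch of how a kernel-side caller opts in to the new statistics; the surrounding mtd/from/buf/len variables are assumed, and this snippet is ours rather than part of the patch:]

	struct mtd_req_stats stats;
	struct mtd_oob_ops ops = {
		.mode = MTD_OPS_PLACE_OOB,
		.len = len,
		.datbuf = buf,
		.stats = &stats,	/* request per-read ECC statistics */
	};
	int ret = mtd_read_oob(mtd, from, &ops);

	/* mtd_read_oob() zeroes *ops.stats first, so reading it is safe */
	if (!ret || ret == -EUCLEAN)
		pr_info("max bitflips in one ECC step: %u\n",
			stats.max_bitflips);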
@@ -0,0 +1,325 @@
From e97709c9d18903f5acd5fbe2985dd054da0432b1 Mon Sep 17 00:00:00 2001
From: Michał Kępień <kernel@kempniu.pl>
Date: Wed, 29 Jun 2022 14:57:35 +0200
Subject: [PATCH 2/4] mtd: always initialize 'stats' in struct mtd_oob_ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As the 'stats' field in struct mtd_oob_ops is used in conditional
expressions, ensure it is always zero-initialized in all such structures
to prevent random stack garbage from being interpreted as a pointer.

Strictly speaking, this problem currently only needs to be fixed for
struct mtd_oob_ops structures subsequently passed to mtd_read_oob().
However, this commit goes a step further and makes all instances of
struct mtd_oob_ops in the tree zero-initialized, in hope of preventing
future problems, e.g. if struct mtd_req_stats gets extended with write
statistics at some point.

Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-3-kernel@kempniu.pl
---
 drivers/mtd/inftlcore.c                 | 6 +++---
 drivers/mtd/mtdswap.c                   | 6 +++---
 drivers/mtd/nand/onenand/onenand_base.c | 4 ++--
 drivers/mtd/nand/onenand/onenand_bbt.c  | 2 +-
 drivers/mtd/nand/raw/nand_bbt.c         | 8 ++++----
 drivers/mtd/nand/raw/sm_common.c        | 2 +-
 drivers/mtd/nftlcore.c                  | 6 +++---
 drivers/mtd/sm_ftl.c                    | 4 ++--
 drivers/mtd/ssfdc.c                     | 2 +-
 drivers/mtd/tests/nandbiterrs.c         | 2 +-
 drivers/mtd/tests/oobtest.c             | 8 ++++----
 drivers/mtd/tests/readtest.c            | 2 +-
 fs/jffs2/wbuf.c                         | 6 +++---
 13 files changed, 29 insertions(+), 29 deletions(-)

--- a/drivers/mtd/inftlcore.c
+++ b/drivers/mtd/inftlcore.c
@@ -136,7 +136,7 @@ static void inftl_remove_dev(struct mtd_
 int inftl_read_oob(struct mtd_info *mtd, loff_t offs, size_t len,
 		   size_t *retlen, uint8_t *buf)
 {
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int res;
 
 	ops.mode = MTD_OPS_PLACE_OOB;
@@ -156,7 +156,7 @@ int inftl_read_oob(struct mtd_info *mtd,
 int inftl_write_oob(struct mtd_info *mtd, loff_t offs, size_t len,
 		    size_t *retlen, uint8_t *buf)
 {
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int res;
 
 	ops.mode = MTD_OPS_PLACE_OOB;
@@ -176,7 +176,7 @@ int inftl_write_oob(struct mtd_info *mtd
 static int inftl_write(struct mtd_info *mtd, loff_t offs, size_t len,
 		       size_t *retlen, uint8_t *buf, uint8_t *oob)
 {
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int res;
 
 	ops.mode = MTD_OPS_PLACE_OOB;
--- a/drivers/mtd/mtdswap.c
+++ b/drivers/mtd/mtdswap.c
@@ -323,7 +323,7 @@ static int mtdswap_read_markers(struct m
 	struct mtdswap_oobdata *data, *data2;
 	int ret;
 	loff_t offset;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 
 	offset = mtdswap_eb_offset(d, eb);
 
@@ -370,7 +370,7 @@ static int mtdswap_write_marker(struct m
 	struct mtdswap_oobdata n;
 	int ret;
 	loff_t offset;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 
 	ops.ooboffs = 0;
 	ops.oobbuf = (uint8_t *)&n;
@@ -879,7 +879,7 @@ static unsigned int mtdswap_eblk_passes(
 	loff_t base, pos;
 	unsigned int *p1 = (unsigned int *)d->page_buf;
 	unsigned char *p2 = (unsigned char *)d->oob_buf;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int ret;
 
 	ops.mode = MTD_OPS_AUTO_OOB;
--- a/drivers/mtd/nand/onenand/onenand_base.c
+++ b/drivers/mtd/nand/onenand/onenand_base.c
@@ -2935,7 +2935,7 @@ static int do_otp_write(struct mtd_info
 	struct onenand_chip *this = mtd->priv;
 	unsigned char *pbuf = buf;
 	int ret;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 
 	/* Force buffer page aligned */
 	if (len < mtd->writesize) {
@@ -2977,7 +2977,7 @@ static int do_otp_lock(struct mtd_info *
 			size_t *retlen, u_char *buf)
 {
 	struct onenand_chip *this = mtd->priv;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int ret;
 
 	if (FLEXONENAND(this)) {
--- a/drivers/mtd/nand/onenand/onenand_bbt.c
+++ b/drivers/mtd/nand/onenand/onenand_bbt.c
@@ -61,7 +61,7 @@ static int create_bbt(struct mtd_info *m
 	int startblock;
 	loff_t from;
 	size_t readlen, ooblen;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int rgn;
 
 	printk(KERN_INFO "Scanning device for bad blocks\n");
--- a/drivers/mtd/nand/raw/nand_bbt.c
+++ b/drivers/mtd/nand/raw/nand_bbt.c
@@ -313,7 +313,7 @@ static int scan_read_oob(struct nand_chi
 			 size_t len)
 {
 	struct mtd_info *mtd = nand_to_mtd(this);
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int res, ret = 0;
 
 	ops.mode = MTD_OPS_PLACE_OOB;
@@ -354,7 +354,7 @@ static int scan_write_bbt(struct nand_ch
 			  uint8_t *buf, uint8_t *oob)
 {
 	struct mtd_info *mtd = nand_to_mtd(this);
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 
 	ops.mode = MTD_OPS_PLACE_OOB;
 	ops.ooboffs = 0;
@@ -416,7 +416,7 @@ static int scan_block_fast(struct nand_c
 {
 	struct mtd_info *mtd = nand_to_mtd(this);
 
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int ret, page_offset;
 
 	ops.ooblen = mtd->oobsize;
@@ -756,7 +756,7 @@ static int write_bbt(struct nand_chip *t
 	uint8_t rcode = td->reserved_block_code;
 	size_t retlen, len = 0;
 	loff_t to;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 
 	ops.ooblen = mtd->oobsize;
 	ops.ooboffs = 0;
--- a/drivers/mtd/nand/raw/sm_common.c
+++ b/drivers/mtd/nand/raw/sm_common.c
@@ -99,7 +99,7 @@ static const struct mtd_ooblayout_ops oo
 static int sm_block_markbad(struct nand_chip *chip, loff_t ofs)
 {
 	struct mtd_info *mtd = nand_to_mtd(chip);
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	struct sm_oob oob;
 	int ret;
 
--- a/drivers/mtd/nftlcore.c
+++ b/drivers/mtd/nftlcore.c
@@ -124,7 +124,7 @@ int nftl_read_oob(struct mtd_info *mtd,
 		  size_t *retlen, uint8_t *buf)
 {
 	loff_t mask = mtd->writesize - 1;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int res;
 
 	ops.mode = MTD_OPS_PLACE_OOB;
@@ -145,7 +145,7 @@ int nftl_write_oob(struct mtd_info *mtd,
 		   size_t *retlen, uint8_t *buf)
 {
 	loff_t mask = mtd->writesize - 1;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int res;
 
 	ops.mode = MTD_OPS_PLACE_OOB;
@@ -168,7 +168,7 @@ static int nftl_write(struct mtd_info *m
 		      size_t *retlen, uint8_t *buf, uint8_t *oob)
 {
 	loff_t mask = mtd->writesize - 1;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int res;
 
 	ops.mode = MTD_OPS_PLACE_OOB;
--- a/drivers/mtd/sm_ftl.c
+++ b/drivers/mtd/sm_ftl.c
@@ -239,7 +239,7 @@ static int sm_read_sector(struct sm_ftl
 			  uint8_t *buffer, struct sm_oob *oob)
 {
 	struct mtd_info *mtd = ftl->trans->mtd;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	struct sm_oob tmp_oob;
 	int ret = -EIO;
 	int try = 0;
@@ -323,7 +323,7 @@ static int sm_write_sector(struct sm_ftl
 			   int zone, int block, int boffset,
 			   uint8_t *buffer, struct sm_oob *oob)
 {
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	struct mtd_info *mtd = ftl->trans->mtd;
 	int ret;
 
--- a/drivers/mtd/ssfdc.c
+++ b/drivers/mtd/ssfdc.c
@@ -163,7 +163,7 @@ static int read_physical_sector(struct m
 /* Read redundancy area (wrapper to MTD_READ_OOB */
 static int read_raw_oob(struct mtd_info *mtd, loff_t offs, uint8_t *buf)
 {
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int ret;
 
 	ops.mode = MTD_OPS_RAW;
--- a/drivers/mtd/tests/nandbiterrs.c
+++ b/drivers/mtd/tests/nandbiterrs.c
@@ -99,7 +99,7 @@ static int write_page(int log)
 static int rewrite_page(int log)
 {
 	int err = 0;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 
 	if (log)
 		pr_info("rewrite page\n");
--- a/drivers/mtd/tests/oobtest.c
+++ b/drivers/mtd/tests/oobtest.c
@@ -56,7 +56,7 @@ static void do_vary_offset(void)
 static int write_eraseblock(int ebnum)
 {
 	int i;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int err = 0;
 	loff_t addr = (loff_t)ebnum * mtd->erasesize;
 
@@ -165,7 +165,7 @@ static size_t memffshow(loff_t addr, lof
 static int verify_eraseblock(int ebnum)
 {
 	int i;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int err = 0;
 	loff_t addr = (loff_t)ebnum * mtd->erasesize;
 	size_t bitflips;
@@ -260,7 +260,7 @@ static int verify_eraseblock(int ebnum)
 
 static int verify_eraseblock_in_one_go(int ebnum)
 {
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int err = 0;
 	loff_t addr = (loff_t)ebnum * mtd->erasesize;
 	size_t len = mtd->oobavail * pgcnt;
@@ -338,7 +338,7 @@ static int __init mtd_oobtest_init(void)
 	int err = 0;
 	unsigned int i;
 	uint64_t tmp;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	loff_t addr = 0, addr0;
 
 	printk(KERN_INFO "\n");
--- a/drivers/mtd/tests/readtest.c
+++ b/drivers/mtd/tests/readtest.c
@@ -47,7 +47,7 @@ static int read_eraseblock_by_page(int e
 				err = ret;
 		}
 		if (mtd->oobsize) {
-			struct mtd_oob_ops ops;
+			struct mtd_oob_ops ops = { };
 
 			ops.mode      = MTD_OPS_PLACE_OOB;
 			ops.len       = 0;
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1035,7 +1035,7 @@ int jffs2_check_oob_empty(struct jffs2_s
 {
 	int i, ret;
 	int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 
 	ops.mode = MTD_OPS_AUTO_OOB;
 	ops.ooblen = NR_OOB_SCAN_PAGES * c->oobavail;
@@ -1076,7 +1076,7 @@ int jffs2_check_oob_empty(struct jffs2_s
 int jffs2_check_nand_cleanmarker(struct jffs2_sb_info *c,
 				 struct jffs2_eraseblock *jeb)
 {
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int ret, cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
 
 	ops.mode = MTD_OPS_AUTO_OOB;
@@ -1101,7 +1101,7 @@ int jffs2_write_nand_cleanmarker(struct
 				 struct jffs2_eraseblock *jeb)
 {
 	int ret;
-	struct mtd_oob_ops ops;
+	struct mtd_oob_ops ops = { };
 	int cmlen = min_t(int, c->oobavail, OOB_CM_SIZE);
 
 	ops.mode = MTD_OPS_AUTO_OOB;
@@ -0,0 +1,172 @@
From 2ed18d818d1f7492172f8dd5904344c7d367e8ed Mon Sep 17 00:00:00 2001
From: Michał Kępień <kernel@kempniu.pl>
Date: Wed, 29 Jun 2022 14:57:36 +0200
Subject: [PATCH 3/4] mtd: add ECC error accounting for each read request
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extend struct mtd_req_stats with two new fields holding the number of
corrected bitflips and uncorrectable errors detected during a read
operation. This is a prerequisite for ultimately passing those counters
to user space, where they can be useful to applications for making
better-informed choices about moving data around.

Unlike 'max_bitflips' (which is set - in a common code path - to the
return value of a function called while the MTD device's mutex is held),
these counters have to be maintained in each MTD driver which defines
the '_read_oob' callback because the statistics need to be calculated
while the MTD device's mutex is held.

Suggested-by: Boris Brezillon <boris.brezillon@collabora.com>
Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-4-kernel@kempniu.pl
---
 drivers/mtd/devices/docg3.c             |  8 ++++++++
 drivers/mtd/nand/onenand/onenand_base.c | 12 ++++++++++++
 drivers/mtd/nand/raw/nand_base.c        | 10 ++++++++++
 drivers/mtd/nand/spi/core.c             | 10 ++++++++++
 include/linux/mtd/mtd.h                 |  2 ++
 5 files changed, 42 insertions(+)

--- a/drivers/mtd/devices/docg3.c
+++ b/drivers/mtd/devices/docg3.c
@@ -871,6 +871,7 @@ static int doc_read_oob(struct mtd_info
 	u8 *buf = ops->datbuf;
 	size_t len, ooblen, nbdata, nboob;
 	u8 hwecc[DOC_ECC_BCH_SIZE], eccconf1;
+	struct mtd_ecc_stats old_stats;
 	int max_bitflips = 0;
 
 	if (buf)
@@ -895,6 +896,7 @@ static int doc_read_oob(struct mtd_info
 	ret = 0;
 	skip = from % DOC_LAYOUT_PAGE_SIZE;
 	mutex_lock(&docg3->cascade->lock);
+	old_stats = mtd->ecc_stats;
 	while (ret >= 0 && (len > 0 || ooblen > 0)) {
 		calc_block_sector(from - skip, &block0, &block1, &page, &ofs,
 				  docg3->reliable);
@@ -966,6 +968,12 @@ static int doc_read_oob(struct mtd_info
 	}
 
 out:
+	if (ops->stats) {
+		ops->stats->uncorrectable_errors +=
+			mtd->ecc_stats.failed - old_stats.failed;
+		ops->stats->corrected_bitflips +=
+			mtd->ecc_stats.corrected - old_stats.corrected;
+	}
 	mutex_unlock(&docg3->cascade->lock);
 	return ret;
 err_in_read:
--- a/drivers/mtd/nand/onenand/onenand_base.c
+++ b/drivers/mtd/nand/onenand/onenand_base.c
@@ -1440,6 +1440,7 @@ static int onenand_read_oob(struct mtd_i
 			    struct mtd_oob_ops *ops)
 {
 	struct onenand_chip *this = mtd->priv;
+	struct mtd_ecc_stats old_stats;
 	int ret;
 
 	switch (ops->mode) {
@@ -1453,12 +1454,23 @@ static int onenand_read_oob(struct mtd_i
 	}
 
 	onenand_get_device(mtd, FL_READING);
+
+	old_stats = mtd->ecc_stats;
+
 	if (ops->datbuf)
 		ret = ONENAND_IS_4KB_PAGE(this) ?
 			onenand_mlc_read_ops_nolock(mtd, from, ops) :
 			onenand_read_ops_nolock(mtd, from, ops);
 	else
 		ret = onenand_read_oob_nolock(mtd, from, ops);
+
+	if (ops->stats) {
+		ops->stats->uncorrectable_errors +=
+			mtd->ecc_stats.failed - old_stats.failed;
+		ops->stats->corrected_bitflips +=
+			mtd->ecc_stats.corrected - old_stats.corrected;
+	}
+
 	onenand_release_device(mtd);
 
 	return ret;
--- a/drivers/mtd/nand/raw/nand_base.c
+++ b/drivers/mtd/nand/raw/nand_base.c
@@ -3815,6 +3815,7 @@ static int nand_read_oob(struct mtd_info
 			 struct mtd_oob_ops *ops)
 {
 	struct nand_chip *chip = mtd_to_nand(mtd);
+	struct mtd_ecc_stats old_stats;
 	int ret;
 
 	ops->retlen = 0;
@@ -3826,11 +3827,20 @@ static int nand_read_oob(struct mtd_info
 
 	nand_get_device(chip);
 
+	old_stats = mtd->ecc_stats;
+
 	if (!ops->datbuf)
 		ret = nand_do_read_oob(chip, from, ops);
 	else
 		ret = nand_do_read_ops(chip, from, ops);
 
+	if (ops->stats) {
+		ops->stats->uncorrectable_errors +=
+			mtd->ecc_stats.failed - old_stats.failed;
+		ops->stats->corrected_bitflips +=
+			mtd->ecc_stats.corrected - old_stats.corrected;
+	}
+
 	nand_release_device(chip);
 	return ret;
 }
--- a/drivers/mtd/nand/spi/core.c
+++ b/drivers/mtd/nand/spi/core.c
@@ -629,6 +629,7 @@ static int spinand_mtd_read(struct mtd_i
 {
 	struct spinand_device *spinand = mtd_to_spinand(mtd);
 	struct nand_device *nand = mtd_to_nanddev(mtd);
+	struct mtd_ecc_stats old_stats;
 	unsigned int max_bitflips = 0;
 	struct nand_io_iter iter;
 	bool disable_ecc = false;
@@ -640,6 +641,8 @@ static int spinand_mtd_read(struct mtd_i
 
 	mutex_lock(&spinand->lock);
 
+	old_stats = mtd->ecc_stats;
+
 	nanddev_io_for_each_page(nand, NAND_PAGE_READ, from, ops, &iter) {
 		if (disable_ecc)
 			iter.req.mode = MTD_OPS_RAW;
@@ -662,6 +665,13 @@ static int spinand_mtd_read(struct mtd_i
 		ops->oobretlen += iter.req.ooblen;
 	}
 
+	if (ops->stats) {
+		ops->stats->uncorrectable_errors +=
+			mtd->ecc_stats.failed - old_stats.failed;
+		ops->stats->corrected_bitflips +=
+			mtd->ecc_stats.corrected - old_stats.corrected;
+	}
+
 	mutex_unlock(&spinand->lock);
 
 	if (ecc_failed && !ret)
--- a/include/linux/mtd/mtd.h
+++ b/include/linux/mtd/mtd.h
@@ -41,6 +41,8 @@ struct mtd_erase_region_info {
 };
 
 struct mtd_req_stats {
+	unsigned int uncorrectable_errors;
+	unsigned int corrected_bitflips;
 	unsigned int max_bitflips;
 };
 
@@ -0,0 +1,321 @@
From 2c9745d36e04ac27161acd78514f647b9b587ad4 Mon Sep 17 00:00:00 2001
From: Michał Kępień <kernel@kempniu.pl>
Date: Wed, 29 Jun 2022 14:57:37 +0200
Subject: [PATCH 4/4] mtdchar: add MEMREAD ioctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

User-space applications making use of MTD devices via /dev/mtd*
character devices currently have limited capabilities for reading data:

- only deprecated methods of accessing OOB layout information exist,

- there is no way to explicitly specify MTD operation mode to use; it
  is auto-selected based on the MTD file mode (MTD_FILE_MODE_*) set
  for the character device; in particular, this prevents using
  MTD_OPS_AUTO_OOB for reads,

- all existing user-space interfaces which cause mtd_read() or
  mtd_read_oob() to be called (via mtdchar_read() and
  mtdchar_read_oob(), respectively) return success even when those
  functions return -EUCLEAN or -EBADMSG; this renders user-space
  applications using these interfaces unaware of any corrected
  bitflips or uncorrectable ECC errors detected during reads.

Note that the existing MEMWRITE ioctl allows the MTD operation mode to
be explicitly set, allowing user-space applications to write page data
and OOB data without requiring them to know anything about the OOB
layout of the MTD device they are writing to (MTD_OPS_AUTO_OOB). Also,
the MEMWRITE ioctl does not mangle the return value of mtd_write_oob().

Add a new ioctl, MEMREAD, which addresses the above issues. It is
intended to be a read-side counterpart of the existing MEMWRITE ioctl.
Similarly to the latter, the read operation is performed in a loop which
processes at most mtd->erasesize bytes in each iteration. This is done
to prevent unbounded memory allocations caused by calling kmalloc() with
the 'size' argument taken directly from the struct mtd_read_req provided
by user space. However, the new ioctl is implemented so that the values
it returns match those that would have been returned if just a single
mtd_read_oob() call was issued to handle the entire read operation in
one go.

Note that while just returning -EUCLEAN or -EBADMSG to user space would
already be a valid and useful indication of the ECC algorithm detecting
errors during a read operation, that signal would not be granular enough
to cover all use cases. For example, knowing the maximum number of
bitflips detected in a single ECC step during a read operation performed
on a given page may be useful when dealing with an MTD partition whose
ECC layout varies across pages (e.g. a partition consisting of a
bootloader area using a "custom" ECC layout followed by data pages using
a "standard" ECC layout). To address that, include ECC statistics in
the structure returned to user space by the new MEMREAD ioctl.

Link: https://www.infradead.org/pipermail/linux-mtd/2016-April/067085.html

Suggested-by: Boris Brezillon <boris.brezillon@collabora.com>
Signed-off-by: Michał Kępień <kernel@kempniu.pl>
Acked-by: Richard Weinberger <richard@nod.at>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20220629125737.14418-5-kernel@kempniu.pl
---
 drivers/mtd/mtdchar.c      | 139 +++++++++++++++++++++++++++++++++++++
 include/uapi/mtd/mtd-abi.h |  64 +++++++++++++++--
 2 files changed, 198 insertions(+), 5 deletions(-)

--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -621,6 +621,137 @@ static int mtdchar_write_ioctl(struct mt
 	return ret;
 }
 
+static int mtdchar_read_ioctl(struct mtd_info *mtd,
+		struct mtd_read_req __user *argp)
+{
+	struct mtd_info *master = mtd_get_master(mtd);
+	struct mtd_read_req req;
+	void __user *usr_data, *usr_oob;
+	uint8_t *datbuf = NULL, *oobbuf = NULL;
+	size_t datbuf_len, oobbuf_len;
+	size_t orig_len, orig_ooblen;
+	int ret = 0;
+
+	if (copy_from_user(&req, argp, sizeof(req)))
+		return -EFAULT;
+
+	orig_len = req.len;
+	orig_ooblen = req.ooblen;
+
+	usr_data = (void __user *)(uintptr_t)req.usr_data;
+	usr_oob = (void __user *)(uintptr_t)req.usr_oob;
+
+	if (!master->_read_oob)
+		return -EOPNOTSUPP;
+
+	if (!usr_data)
+		req.len = 0;
+
+	if (!usr_oob)
+		req.ooblen = 0;
+
+	req.ecc_stats.uncorrectable_errors = 0;
+	req.ecc_stats.corrected_bitflips = 0;
+	req.ecc_stats.max_bitflips = 0;
+
+	req.len &= 0xffffffff;
+	req.ooblen &= 0xffffffff;
+
+	if (req.start + req.len > mtd->size) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	datbuf_len = min_t(size_t, req.len, mtd->erasesize);
+	if (datbuf_len > 0) {
+		datbuf = kvmalloc(datbuf_len, GFP_KERNEL);
+		if (!datbuf) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	oobbuf_len = min_t(size_t, req.ooblen, mtd->erasesize);
+	if (oobbuf_len > 0) {
+		oobbuf = kvmalloc(oobbuf_len, GFP_KERNEL);
+		if (!oobbuf) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	while (req.len > 0 || (!usr_data && req.ooblen > 0)) {
+		struct mtd_req_stats stats;
+		struct mtd_oob_ops ops = {
+			.mode = req.mode,
+			.len = min_t(size_t, req.len, datbuf_len),
+			.ooblen = min_t(size_t, req.ooblen, oobbuf_len),
+			.datbuf = datbuf,
+			.oobbuf = oobbuf,
+			.stats = &stats,
+		};
+
+		/*
+		 * Shorten non-page-aligned, eraseblock-sized reads so that the
+		 * read ends on an eraseblock boundary. This is necessary in
+		 * order to prevent OOB data for some pages from being
+		 * duplicated in the output of non-page-aligned reads requiring
+		 * multiple mtd_read_oob() calls to be completed.
+		 */
+		if (ops.len == mtd->erasesize)
+			ops.len -= mtd_mod_by_ws(req.start + ops.len, mtd);
+
+		ret = mtd_read_oob(mtd, (loff_t)req.start, &ops);
+
+		req.ecc_stats.uncorrectable_errors +=
+			stats.uncorrectable_errors;
+		req.ecc_stats.corrected_bitflips += stats.corrected_bitflips;
+		req.ecc_stats.max_bitflips =
+			max(req.ecc_stats.max_bitflips, stats.max_bitflips);
+
+		if (ret && !mtd_is_bitflip_or_eccerr(ret))
+			break;
+
+		if (copy_to_user(usr_data, ops.datbuf, ops.retlen) ||
+		    copy_to_user(usr_oob, ops.oobbuf, ops.oobretlen)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		req.start += ops.retlen;
+		req.len -= ops.retlen;
+		usr_data += ops.retlen;
+
+		req.ooblen -= ops.oobretlen;
+		usr_oob += ops.oobretlen;
+	}
+
+	/*
+	 * As multiple iterations of the above loop (and therefore multiple
+	 * mtd_read_oob() calls) may be necessary to complete the read request,
+	 * adjust the final return code to ensure it accounts for all detected
+	 * ECC errors.
+	 */
+	if (!ret || mtd_is_bitflip(ret)) {
+		if (req.ecc_stats.uncorrectable_errors > 0)
+			ret = -EBADMSG;
+		else if (req.ecc_stats.corrected_bitflips > 0)
+			ret = -EUCLEAN;
+	}
+
+out:
+	req.len = orig_len - req.len;
+	req.ooblen = orig_ooblen - req.ooblen;
+
+	if (copy_to_user(argp, &req, sizeof(req)))
+		ret = -EFAULT;
+
+	kvfree(datbuf);
+	kvfree(oobbuf);
+
+	return ret;
+}
+
 static int mtdchar_ioctl(struct file *file, u_int cmd, u_long arg)
 {
 	struct mtd_file_info *mfi = file->private_data;
@@ -643,6 +774,7 @@ static int mtdchar_ioctl(struct file *fi
 	case MEMGETINFO:
 	case MEMREADOOB:
 	case MEMREADOOB64:
+	case MEMREAD:
 	case MEMISLOCKED:
 	case MEMGETOOBSEL:
 	case MEMGETBADBLOCK:
@@ -817,6 +949,13 @@ static int mtdchar_ioctl(struct file *fi
 		break;
 	}
 
+	case MEMREAD:
+	{
+		ret = mtdchar_read_ioctl(mtd,
+			(struct mtd_read_req __user *)arg);
+		break;
+	}
+
 	case MEMLOCK:
 	{
 		struct erase_info_user einfo;
--- a/include/uapi/mtd/mtd-abi.h
+++ b/include/uapi/mtd/mtd-abi.h
@@ -55,9 +55,9 @@ struct mtd_oob_buf64 {
  * @MTD_OPS_RAW:	data are transferred as-is, with no error correction;
  *			this mode implies %MTD_OPS_PLACE_OOB
  *
- * These modes can be passed to ioctl(MEMWRITE) and are also used internally.
- * See notes on "MTD file modes" for discussion on %MTD_OPS_RAW vs.
- * %MTD_FILE_MODE_RAW.
+ * These modes can be passed to ioctl(MEMWRITE) and ioctl(MEMREAD); they are
+ * also used internally. See notes on "MTD file modes" for discussion on
+ * %MTD_OPS_RAW vs. %MTD_FILE_MODE_RAW.
  */
 enum {
 	MTD_OPS_PLACE_OOB = 0,
@@ -91,6 +91,53 @@ struct mtd_write_req {
 	__u8 padding[7];
 };
 
+/**
+ * struct mtd_read_req_ecc_stats - ECC statistics for a read operation
+ *
+ * @uncorrectable_errors: the number of uncorrectable errors that happened
+ *			  during the read operation
+ * @corrected_bitflips: the number of bitflips corrected during the read
+ *			operation
+ * @max_bitflips: the maximum number of bitflips detected in any single ECC
+ *		  step for the data read during the operation; this information
+ *		  can be used to decide whether the data stored in a specific
+ *		  region of the MTD device should be moved somewhere else to
+ *		  avoid data loss.
+ */
+struct mtd_read_req_ecc_stats {
+	__u32 uncorrectable_errors;
+	__u32 corrected_bitflips;
+	__u32 max_bitflips;
+};
+
+/**
+ * struct mtd_read_req - data structure for requesting a read operation
+ *
+ * @start:	start address
+ * @len:	length of data buffer (only lower 32 bits are used)
+ * @ooblen:	length of OOB buffer (only lower 32 bits are used)
+ * @usr_data:	user-provided data buffer
+ * @usr_oob:	user-provided OOB buffer
+ * @mode:	MTD mode (see "MTD operation modes")
+ * @padding:	reserved, must be set to 0
+ * @ecc_stats:	ECC statistics for the read operation
+ *
+ * This structure supports ioctl(MEMREAD) operations, allowing data and/or OOB
+ * reads in various modes. To read from OOB-only, set @usr_data == NULL, and to
+ * read data-only, set @usr_oob == NULL. However, setting both @usr_data and
+ * @usr_oob to NULL is not allowed.
+ */
+struct mtd_read_req {
+	__u64 start;
+	__u64 len;
+	__u64 ooblen;
+	__u64 usr_data;
+	__u64 usr_oob;
+	__u8 mode;
+	__u8 padding[7];
+	struct mtd_read_req_ecc_stats ecc_stats;
+};
+
 #define MTD_ABSENT		0
 #define MTD_RAM			1
 #define MTD_ROM			2
@@ -207,6 +254,12 @@ struct otp_info {
 #define MEMWRITE		_IOWR('M', 24, struct mtd_write_req)
 /* Erase a given range of user data (must be in mode %MTD_FILE_MODE_OTP_USER) */
 #define OTPERASE		_IOW('M', 25, struct otp_info)
+/*
+ * Most generic read interface; can read in-band and/or out-of-band in various
+ * modes (see "struct mtd_read_req"). This ioctl is not supported for flashes
+ * without OOB, e.g., NOR flash.
+ */
+#define MEMREAD			_IOWR('M', 26, struct mtd_read_req)
 
 /*
  * Obsolete legacy interface. Keep it in order not to break userspace
@@ -270,8 +323,9 @@ struct mtd_ecc_stats {
  * Note: %MTD_FILE_MODE_RAW provides the same functionality as %MTD_OPS_RAW -
  * raw access to the flash, without error correction or autoplacement schemes.
  * Wherever possible, the MTD_OPS_* mode will override the MTD_FILE_MODE_* mode
- * (e.g., when using ioctl(MEMWRITE)), but in some cases, the MTD_FILE_MODE is
- * used out of necessity (e.g., `write()', ioctl(MEMWRITEOOB64)).
+ * (e.g., when using ioctl(MEMWRITE) or ioctl(MEMREAD)), but in some cases, the
+ * MTD_FILE_MODE is used out of necessity (e.g., `write()',
+ * ioctl(MEMWRITEOOB64)).
  */
 enum mtd_file_modes {
 	MTD_FILE_MODE_NORMAL = MTD_OTP_OFF,
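
[Editor's note: to make the ABI concrete, a minimal user-space sketch of a data-only MEMREAD call, written against the uapi structures added above; error handling is trimmed, and the device path and read size are placeholders:]

#include <errno.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <mtd/mtd-abi.h>

int main(void)
{
	struct mtd_read_req req = { 0 };
	uint8_t *buf = malloc(2048);
	int fd = open("/dev/mtd0", O_RDONLY);

	req.start = 0;
	req.len = 2048;
	req.usr_data = (uintptr_t)buf;	/* usr_oob stays 0: data-only read */
	req.mode = MTD_OPS_AUTO_OOB;	/* now selectable per request */

	/* unlike read(2) on mtdchar, -EUCLEAN/-EBADMSG reach user space */
	if (ioctl(fd, MEMREAD, &req) == 0 || errno == EUCLEAN)
		printf("corrected=%u uncorrectable=%u max/step=%u\n",
		       req.ecc_stats.corrected_bitflips,
		       req.ecc_stats.uncorrectable_errors,
		       req.ecc_stats.max_bitflips);
	return 0;
}

[The errno check mirrors the return-code adjustment in the loop above: corrected bitflips surface as EUCLEAN instead of being silently swallowed.]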
@@ -0,0 +1,35 @@
From ebed787a0becb9354f0a23620a5130cccd6c730c Mon Sep 17 00:00:00 2001
From: Daniel Golle <daniel@makrotopia.org>
Date: Thu, 19 Jan 2023 03:45:43 +0000
Subject: [PATCH] mtd: spinand: macronix: use scratch buffer for DMA operation

The mx35lf1ge4ab_get_eccsr() function uses an SPI DMA operation to
read the eccsr, hence the buffer should not be on stack. Since commit
380583227c0c7f ("spi: spi-mem: Add extra sanity checks on the op param")
the kernel emits a warning and blocks such operations.

Use the scratch buffer to get eccsr instead of trying to directly read
into a stack-allocated variable.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Dhruva Gole <d-gole@ti.com>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/Y8i85zM0u4XdM46z@makrotopia.org
---
 drivers/mtd/nand/spi/macronix.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

--- a/drivers/mtd/nand/spi/macronix.c
+++ b/drivers/mtd/nand/spi/macronix.c
@@ -83,9 +83,10 @@ static int mx35lf1ge4ab_ecc_get_status(s
 	 * in order to avoid forcing the wear-leveling layer to move
 	 * data around if it's not necessary.
 	 */
-	if (mx35lf1ge4ab_get_eccsr(spinand, &eccsr))
+	if (mx35lf1ge4ab_get_eccsr(spinand, spinand->scratchbuf))
 		return nanddev_get_ecc_conf(nand)->strength;
 
+	eccsr = *spinand->scratchbuf;
 	if (WARN_ON(eccsr > nanddev_get_ecc_conf(nand)->strength ||
 		    !eccsr))
 		return nanddev_get_ecc_conf(nand)->strength;
@@ -0,0 +1,47 @@
From 281f7a6c1a33fffcde32001bacbb4f672140fbf9 Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Wed, 8 Mar 2023 09:20:21 +0100
Subject: [PATCH] mtd: core: prepare mtd_otp_nvmem_add() to handle
 -EPROBE_DEFER

NVMEM will soon get the ability to use nvmem layouts, and these might
not be ready when nvmem_register() is called, which may then return
-EPROBE_DEFER. Don't print the error message in this case.

Signed-off-by: Michael Walle <michael@walle.cc>
Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
Link: https://lore.kernel.org/linux-mtd/20230308082021.870459-4-michael@walle.cc
---
 drivers/mtd/mtdcore.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

--- a/drivers/mtd/mtdcore.c
+++ b/drivers/mtd/mtdcore.c
@@ -960,8 +960,8 @@ static int mtd_otp_nvmem_add(struct mtd_
 		nvmem = mtd_otp_nvmem_register(mtd, "user-otp", size,
 					       mtd_nvmem_user_otp_reg_read);
 		if (IS_ERR(nvmem)) {
-			dev_err(dev, "Failed to register OTP NVMEM device\n");
-			return PTR_ERR(nvmem);
+			err = PTR_ERR(nvmem);
+			goto err;
 		}
 		mtd->otp_user_nvmem = nvmem;
 	}
@@ -978,7 +978,6 @@ static int mtd_otp_nvmem_add(struct mtd_
 			nvmem = mtd_otp_nvmem_register(mtd, "factory-otp", size,
 						       mtd_nvmem_fact_otp_reg_read);
 			if (IS_ERR(nvmem)) {
-				dev_err(dev, "Failed to register OTP NVMEM device\n");
 				err = PTR_ERR(nvmem);
 				goto err;
 			}
@@ -991,7 +990,7 @@ static int mtd_otp_nvmem_add(struct mtd_
 err:
 	if (mtd->otp_user_nvmem)
 		nvmem_unregister(mtd->otp_user_nvmem);
-	return err;
+	return dev_err_probe(dev, err, "Failed to register OTP NVMEM device\n");
 }
 
 /**
@@ -0,0 +1,165 @@
From 8610037e8106b48c79cfe0afb92b2b2466e51c3d Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Tue, 1 Mar 2022 23:55:47 -0800
Subject: [PATCH] page_pool: Add allocation stats

Add per-pool statistics counters for the allocation path of a page pool.
These stats are incremented in softirq context, so no locking or per-cpu
variables are needed.

This code is disabled by default and a kernel config option is provided for
users who wish to enable them.

The statistics added are:
- fast: successful fast path allocations
- slow: slow path order-0 allocations
- slow_high_order: slow path high order allocations
- empty: ptr ring is empty, so a slow path allocation was forced.
- refill: an allocation which triggered a refill of the cache
- waive: pages obtained from the ptr ring that cannot be added to
the cache due to a NUMA mismatch.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/page_pool.h | 18 ++++++++++++++++++
net/Kconfig | 13 +++++++++++++
net/core/page_pool.c | 24 ++++++++++++++++++++----
3 files changed, 51 insertions(+), 4 deletions(-)

--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -82,6 +82,19 @@ struct page_pool_params {
unsigned int offset; /* DMA addr offset */
};

+#ifdef CONFIG_PAGE_POOL_STATS
+struct page_pool_alloc_stats {
+ u64 fast; /* fast path allocations */
+ u64 slow; /* slow-path order 0 allocations */
+ u64 slow_high_order; /* slow-path high order allocations */
+ u64 empty; /* failed refills due to empty ptr ring, forcing
+ * slow path allocation
+ */
+ u64 refill; /* allocations via successful refill */
+ u64 waive; /* failed refills due to numa zone mismatch */
+};
+#endif
+
struct page_pool {
struct page_pool_params p;

@@ -132,6 +145,11 @@ struct page_pool {
refcount_t user_cnt;

u64 destroy_cnt;
+
+#ifdef CONFIG_PAGE_POOL_STATS
+ /* these stats are incremented while in softirq context */
+ struct page_pool_alloc_stats alloc_stats;
+#endif
};

struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -434,6 +434,19 @@ config NET_DEVLINK
config PAGE_POOL
bool

+config PAGE_POOL_STATS
+ default n
+ bool "Page pool stats"
+ depends on PAGE_POOL
+ help
+ Enable page pool statistics to track page allocation and recycling
+ in page pools. This option incurs additional CPU cost in allocation
+ and recycle paths and additional memory cost to store the statistics.
+ These statistics are only available if this option is enabled and if
+ the driver using the page pool supports exporting this data.
+
+ If unsure, say N.
+
config FAILOVER
tristate "Generic failover module"
help
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -26,6 +26,13 @@

#define BIAS_MAX LONG_MAX

+#ifdef CONFIG_PAGE_POOL_STATS
+/* alloc_stat_inc is intended to be used in softirq context */
+#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++)
+#else
+#define alloc_stat_inc(pool, __stat)
+#endif
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -117,8 +124,10 @@ static struct page *page_pool_refill_all
int pref_nid; /* preferred NUMA node */

/* Quicker fallback, avoid locks when ring is empty */
- if (__ptr_ring_empty(r))
+ if (__ptr_ring_empty(r)) {
+ alloc_stat_inc(pool, empty);
return NULL;
+ }

/* Softirq guarantee CPU and thus NUMA node is stable. This,
* assumes CPU refilling driver RX-ring will also run RX-NAPI.
@@ -148,14 +157,17 @@ static struct page *page_pool_refill_all
* This limit stress on page buddy alloactor.
*/
page_pool_return_page(pool, page);
+ alloc_stat_inc(pool, waive);
page = NULL;
break;
}
} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);

/* Return last page */
- if (likely(pool->alloc.count > 0))
+ if (likely(pool->alloc.count > 0)) {
page = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, refill);
+ }

spin_unlock(&r->consumer_lock);
return page;
@@ -170,6 +182,7 @@ static struct page *__page_pool_get_cach
if (likely(pool->alloc.count)) {
/* Fast-path */
page = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, fast);
} else {
page = page_pool_refill_alloc_cache(pool);
}
@@ -241,6 +254,7 @@ static struct page *__page_pool_alloc_pa
return NULL;
}

+ alloc_stat_inc(pool, slow_high_order);
page_pool_set_pp_info(pool, page);

/* Track how many pages are held 'in-flight' */
@@ -295,10 +309,12 @@ static struct page *__page_pool_alloc_pa
}

/* Return last page */
- if (likely(pool->alloc.count > 0))
+ if (likely(pool->alloc.count > 0)) {
page = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, slow);
- else
+ } else {
page = NULL;
+ }

/* When page just alloc'ed is should/must have refcnt 1. */
return page;
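
With CONFIG_PAGE_POOL_STATS=y the new counters are plain u64 fields bumped from softirq context, so a debug dump running in the pool's own NAPI context can read them without extra synchronization. A hedged sketch (demo_dump_alloc_stats() is hypothetical, not from the patch):

#ifdef CONFIG_PAGE_POOL_STATS
/* Sketch, not from the patch: read the softirq-updated counters in-context. */
static void demo_dump_alloc_stats(const struct page_pool *pool)
{
	pr_info("fast=%llu slow=%llu slow_ho=%llu empty=%llu refill=%llu waive=%llu\n",
		pool->alloc_stats.fast, pool->alloc_stats.slow,
		pool->alloc_stats.slow_high_order, pool->alloc_stats.empty,
		pool->alloc_stats.refill, pool->alloc_stats.waive);
}
#endif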
@@ -0,0 +1,140 @@
From ad6fa1e1ab1b8164f1ba296b1b4dc556a483bcad Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Tue, 1 Mar 2022 23:55:48 -0800
Subject: [PATCH 2/3] page_pool: Add recycle stats

Add per-cpu stats tracking page pool recycling events:
- cached: recycling placed page in the page pool cache
- cache_full: page pool cache was full
- ring: page placed into the ptr ring
- ring_full: page released from page pool because the ptr ring was full
- released_refcnt: page released (and not recycled) because refcnt > 1

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/page_pool.h | 16 ++++++++++++++++
net/core/page_pool.c | 30 ++++++++++++++++++++++++++++--
2 files changed, 44 insertions(+), 2 deletions(-)

--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -93,6 +93,18 @@ struct page_pool_alloc_stats {
u64 refill; /* allocations via successful refill */
u64 waive; /* failed refills due to numa zone mismatch */
};
+
+struct page_pool_recycle_stats {
+ u64 cached; /* recycling placed page in the cache. */
+ u64 cache_full; /* cache was full */
+ u64 ring; /* recycling placed page back into ptr ring */
+ u64 ring_full; /* page was released from page-pool because
+ * PTR ring was full.
+ */
+ u64 released_refcnt; /* page released because of elevated
+ * refcnt
+ */
+};
#endif

struct page_pool {
@@ -136,6 +148,10 @@ struct page_pool {
*/
struct ptr_ring ring;

+#ifdef CONFIG_PAGE_POOL_STATS
+ /* recycle stats are per-cpu to avoid locking */
+ struct page_pool_recycle_stats __percpu *recycle_stats;
+#endif
atomic_t pages_state_release_cnt;

/* A page_pool is strictly tied to a single RX-queue being
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -29,8 +29,15 @@
#ifdef CONFIG_PAGE_POOL_STATS
/* alloc_stat_inc is intended to be used in softirq context */
#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++)
+/* recycle_stat_inc is safe to use when preemption is possible. */
+#define recycle_stat_inc(pool, __stat) \
+ do { \
+ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+ this_cpu_inc(s->__stat); \
+ } while (0)
#else
#define alloc_stat_inc(pool, __stat)
+#define recycle_stat_inc(pool, __stat)
#endif

static int page_pool_init(struct page_pool *pool,
@@ -80,6 +87,12 @@ static int page_pool_init(struct page_po
pool->p.flags & PP_FLAG_PAGE_FRAG)
return -EINVAL;

+#ifdef CONFIG_PAGE_POOL_STATS
+ pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
+ if (!pool->recycle_stats)
+ return -ENOMEM;
+#endif
+
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;

@@ -412,7 +425,12 @@ static bool page_pool_recycle_in_ring(st
else
ret = ptr_ring_produce_bh(&pool->ring, page);

- return (ret == 0) ? true : false;
+ if (!ret) {
+ recycle_stat_inc(pool, ring);
+ return true;
+ }
+
+ return false;
}

/* Only allow direct recycling in special circumstances, into the
@@ -423,11 +441,14 @@ static bool page_pool_recycle_in_ring(st
static bool page_pool_recycle_in_cache(struct page *page,
struct page_pool *pool)
{
- if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+ if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
+ recycle_stat_inc(pool, cache_full);
return false;
+ }

/* Caller MUST have verified/know (page_ref_count(page) == 1) */
pool->alloc.cache[pool->alloc.count++] = page;
+ recycle_stat_inc(pool, cached);
return true;
}

@@ -482,6 +503,7 @@ __page_pool_put_page(struct page_pool *p
* doing refcnt based recycle tricks, meaning another process
* will be invoking put_page.
*/
+ recycle_stat_inc(pool, released_refcnt);
/* Do not replace this with page_pool_return_page() */
page_pool_release_page(pool, page);
put_page(page);
@@ -495,6 +517,7 @@ void page_pool_put_page(struct page_pool
page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
if (page && !page_pool_recycle_in_ring(pool, page)) {
/* Cache full, fallback to free pages */
+ recycle_stat_inc(pool, ring_full);
page_pool_return_page(pool, page);
}
}
@@ -641,6 +664,9 @@ static void page_pool_free(struct page_p
if (pool->p.flags & PP_FLAG_DMA_MAP)
put_device(pool->p.dev);

+#ifdef CONFIG_PAGE_POOL_STATS
+ free_percpu(pool->recycle_stats);
+#endif
kfree(pool);
}

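
The recycle counters deliberately trade read cost for write cost: writers do a lockless this_cpu_inc() on their own CPU's copy, and the rare reader sums all per-CPU copies. The underlying pattern, as a standalone sketch (all names are hypothetical, not from the patch):

/* Sketch, not from the patch: the per-cpu counter pattern used above.
 * demo must be set up once with: demo = alloc_percpu(struct demo_stats);
 */
struct demo_stats {
	u64 hits;
};
static struct demo_stats __percpu *demo;

static void demo_hit(void)
{
	this_cpu_inc(demo->hits);		/* no lock, no atomic op */
}

static u64 demo_total(void)
{
	u64 sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)		/* slow path: sum every CPU's copy */
		sum += per_cpu_ptr(demo, cpu)->hits;

	return sum;
}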
@@ -0,0 +1,77 @@
From 6b95e3388b1ea0ca63500c5a6e39162dbf828433 Mon Sep 17 00:00:00 2001
From: Joe Damato <jdamato@fastly.com>
Date: Tue, 1 Mar 2022 23:55:49 -0800
Subject: [PATCH 3/3] page_pool: Add function to batch and return stats

Adds a function page_pool_get_stats which can be used by drivers to obtain
stats for a specified page_pool.

Signed-off-by: Joe Damato <jdamato@fastly.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/page_pool.h | 17 +++++++++++++++++
net/core/page_pool.c | 25 +++++++++++++++++++++++++
2 files changed, 42 insertions(+)

--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -105,6 +105,23 @@ struct page_pool_recycle_stats {
* refcnt
*/
};
+
+/* This struct wraps the above stats structs so users of the
+ * page_pool_get_stats API can pass a single argument when requesting the
+ * stats for the page pool.
+ */
+struct page_pool_stats {
+ struct page_pool_alloc_stats alloc_stats;
+ struct page_pool_recycle_stats recycle_stats;
+};
+
+/*
+ * Drivers that wish to harvest page pool stats and report them to users
+ * (perhaps via ethtool, debugfs, or another mechanism) can allocate a
+ * struct page_pool_stats call page_pool_get_stats to get stats for the specified pool.
+ */
+bool page_pool_get_stats(struct page_pool *pool,
+ struct page_pool_stats *stats);
#endif

struct page_pool {
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -35,6 +35,31 @@
struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
this_cpu_inc(s->__stat); \
} while (0)
+
+bool page_pool_get_stats(struct page_pool *pool,
+ struct page_pool_stats *stats)
+{
+ int cpu = 0;
+
+ if (!stats)
+ return false;
+
+ memcpy(&stats->alloc_stats, &pool->alloc_stats, sizeof(pool->alloc_stats));
+
+ for_each_possible_cpu(cpu) {
+ const struct page_pool_recycle_stats *pcpu =
+ per_cpu_ptr(pool->recycle_stats, cpu);
+
+ stats->recycle_stats.cached += pcpu->cached;
+ stats->recycle_stats.cache_full += pcpu->cache_full;
+ stats->recycle_stats.ring += pcpu->ring;
+ stats->recycle_stats.ring_full += pcpu->ring_full;
+ stats->recycle_stats.released_refcnt += pcpu->released_refcnt;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(page_pool_get_stats);
#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
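
Usage is then a single call, with the caller owning (and zero-initializing) the page_pool_stats buffer. A hedged sketch (demo_report() is hypothetical, not from the patch):

/* Sketch, not from the patch: fetching a snapshot of one pool's stats. */
static void demo_report(struct page_pool *pool)
{
	struct page_pool_stats stats = {};

	if (!page_pool_get_stats(pool, &stats))
		return;

	pr_info("alloc fast=%llu, recycled to cache=%llu ring=%llu\n",
		stats.alloc_stats.fast, stats.recycle_stats.cached,
		stats.recycle_stats.ring);
}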
@@ -0,0 +1,55 @@
From 590032a4d2133ecc10d3078a8db1d85a4842f12c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Mon, 11 Apr 2022 16:05:26 +0200
Subject: [PATCH] page_pool: Add recycle stats to page_pool_put_page_bulk

Add missing recycle stats to page_pool_put_page_bulk routine.

Reviewed-by: Joe Damato <jdamato@fastly.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Link: https://lore.kernel.org/r/3712178b51c007cfaed910ea80e68f00c916b1fa.1649685634.git.lorenzo@kernel.org
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
net/core/page_pool.c | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)

--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -36,6 +36,12 @@
this_cpu_inc(s->__stat); \
} while (0)

+#define recycle_stat_add(pool, __stat, val) \
+ do { \
+ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+ this_cpu_add(s->__stat, val); \
+ } while (0)
+
bool page_pool_get_stats(struct page_pool *pool,
struct page_pool_stats *stats)
{
@@ -63,6 +69,7 @@ EXPORT_SYMBOL(page_pool_get_stats);
#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
+#define recycle_stat_add(pool, __stat, val)
#endif

static int page_pool_init(struct page_pool *pool,
@@ -569,9 +576,13 @@ void page_pool_put_page_bulk(struct page
/* Bulk producer into ptr_ring page_pool cache */
page_pool_ring_lock(pool);
for (i = 0; i < bulk_len; i++) {
- if (__ptr_ring_produce(&pool->ring, data[i]))
- break; /* ring full */
+ if (__ptr_ring_produce(&pool->ring, data[i])) {
+ /* ring full */
+ recycle_stat_inc(pool, ring_full);
+ break;
+ }
}
+ recycle_stat_add(pool, ring, i);
page_pool_ring_unlock(pool);

/* Hopefully all pages was return into ptr_ring */
@@ -0,0 +1,147 @@
From f3c5264f452a5b0ac1de1f2f657efbabdea3c76a Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Tue, 12 Apr 2022 18:31:58 +0200
Subject: [PATCH] net: page_pool: introduce ethtool stats

Introduce page_pool APIs to report stats through ethtool and reduce
duplicated code in each driver.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/page_pool.h | 21 ++++++++++++++
net/core/page_pool.c | 63 ++++++++++++++++++++++++++++++++++++++++-
2 files changed, 83 insertions(+), 1 deletion(-)

--- a/include/net/page_pool.h
+++ b/include/net/page_pool.h
@@ -115,6 +115,10 @@ struct page_pool_stats {
struct page_pool_recycle_stats recycle_stats;
};

+int page_pool_ethtool_stats_get_count(void);
+u8 *page_pool_ethtool_stats_get_strings(u8 *data);
+u64 *page_pool_ethtool_stats_get(u64 *data, void *stats);
+
/*
* Drivers that wish to harvest page pool stats and report them to users
* (perhaps via ethtool, debugfs, or another mechanism) can allocate a
@@ -122,6 +126,23 @@ struct page_pool_stats {
*/
bool page_pool_get_stats(struct page_pool *pool,
struct page_pool_stats *stats);
+#else
+
+static inline int page_pool_ethtool_stats_get_count(void)
+{
+ return 0;
+}
+
+static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data)
+{
+ return data;
+}
+
+static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+{
+ return data;
+}
+
#endif

struct page_pool {
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -18,6 +18,7 @@
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
#include <linux/poison.h>
+#include <linux/ethtool.h>

#include <trace/events/page_pool.h>

@@ -42,6 +43,20 @@
this_cpu_add(s->__stat, val); \
} while (0)

+static const char pp_stats[][ETH_GSTRING_LEN] = {
+ "rx_pp_alloc_fast",
+ "rx_pp_alloc_slow",
+ "rx_pp_alloc_slow_ho",
+ "rx_pp_alloc_empty",
+ "rx_pp_alloc_refill",
+ "rx_pp_alloc_waive",
+ "rx_pp_recycle_cached",
+ "rx_pp_recycle_cache_full",
+ "rx_pp_recycle_ring",
+ "rx_pp_recycle_ring_full",
+ "rx_pp_recycle_released_ref",
+};
+
bool page_pool_get_stats(struct page_pool *pool,
struct page_pool_stats *stats)
{
@@ -50,7 +65,13 @@ bool page_pool_get_stats(struct page_poo
if (!stats)
return false;

- memcpy(&stats->alloc_stats, &pool->alloc_stats, sizeof(pool->alloc_stats));
+ /* The caller is responsible to initialize stats. */
+ stats->alloc_stats.fast += pool->alloc_stats.fast;
+ stats->alloc_stats.slow += pool->alloc_stats.slow;
+ stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
+ stats->alloc_stats.empty += pool->alloc_stats.empty;
+ stats->alloc_stats.refill += pool->alloc_stats.refill;
+ stats->alloc_stats.waive += pool->alloc_stats.waive;

for_each_possible_cpu(cpu) {
const struct page_pool_recycle_stats *pcpu =
@@ -66,6 +87,46 @@ bool page_pool_get_stats(struct page_poo
return true;
}
EXPORT_SYMBOL(page_pool_get_stats);
+
+u8 *page_pool_ethtool_stats_get_strings(u8 *data)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pp_stats); i++) {
+ memcpy(data, pp_stats[i], ETH_GSTRING_LEN);
+ data += ETH_GSTRING_LEN;
+ }
+
+ return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
+
+int page_pool_ethtool_stats_get_count(void)
+{
+ return ARRAY_SIZE(pp_stats);
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
+
+u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
+{
+ struct page_pool_stats *pool_stats = stats;
+
+ *data++ = pool_stats->alloc_stats.fast;
+ *data++ = pool_stats->alloc_stats.slow;
+ *data++ = pool_stats->alloc_stats.slow_high_order;
+ *data++ = pool_stats->alloc_stats.empty;
+ *data++ = pool_stats->alloc_stats.refill;
+ *data++ = pool_stats->alloc_stats.waive;
+ *data++ = pool_stats->recycle_stats.cached;
+ *data++ = pool_stats->recycle_stats.cache_full;
+ *data++ = pool_stats->recycle_stats.ring;
+ *data++ = pool_stats->recycle_stats.ring_full;
+ *data++ = pool_stats->recycle_stats.released_refcnt;
+
+ return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get);
+
#else
#define alloc_stat_inc(pool, __stat)
#define recycle_stat_inc(pool, __stat)
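
The three helpers slot straight into a driver's existing ethtool callbacks. A hedged sketch of the wiring (struct demo_priv and its single pool are hypothetical; a real driver would typically sum stats over all of its RX queues and append its own counters after the page pool ones):

/* Sketch, not from the patch: exposing page pool stats through ethtool. */
struct demo_priv {
	struct page_pool *page_pool;
};

static int demo_get_sset_count(struct net_device *dev, int sset)
{
	if (sset != ETH_SS_STATS)
		return -EOPNOTSUPP;

	return page_pool_ethtool_stats_get_count();
}

static void demo_get_strings(struct net_device *dev, u32 sset, u8 *data)
{
	if (sset == ETH_SS_STATS)
		data = page_pool_ethtool_stats_get_strings(data);
	/* driver-private strings would continue from the returned pointer */
}

static void demo_get_ethtool_stats(struct net_device *dev,
				   struct ethtool_stats *stats, u64 *data)
{
	struct demo_priv *priv = netdev_priv(dev);
	struct page_pool_stats pp_stats = {};

	page_pool_get_stats(priv->page_pool, &pp_stats);
	data = page_pool_ethtool_stats_get(data, &pp_stats);
}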
@@ -0,0 +1,99 @@
From 2e88d4ff03013937028f5397268b21e10cf68713 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:45 +0100
Subject: [PATCH] xdp: introduce flags field in xdp_buff/xdp_frame

Introduce flags field in xdp_frame and xdp_buffer data structures
to define additional buffer features. At the moment the only
supported buffer feature is frags bit (XDP_FLAGS_HAS_FRAGS).
frags bit is used to specify if this is a linear buffer
(XDP_FLAGS_HAS_FRAGS not set) or a frags frame (XDP_FLAGS_HAS_FRAGS
set). In the latter case the driver is expected to initialize the
skb_shared_info structure at the end of the first buffer to link together
subsequent buffers belonging to the same frame.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/e389f14f3a162c0a5bc6a2e1aa8dd01a90be117d.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
include/net/xdp.h | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)

--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -66,6 +66,10 @@ struct xdp_txq_info {
struct net_device *dev;
};

+enum xdp_buff_flags {
+ XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */
+};
+
struct xdp_buff {
void *data;
void *data_end;
@@ -74,13 +78,30 @@ struct xdp_buff {
struct xdp_rxq_info *rxq;
struct xdp_txq_info *txq;
u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
+ u32 flags; /* supported values defined in xdp_buff_flags */
};

+static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp)
+{
+ return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS);
+}
+
+static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp)
+{
+ xdp->flags |= XDP_FLAGS_HAS_FRAGS;
+}
+
+static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp)
+{
+ xdp->flags &= ~XDP_FLAGS_HAS_FRAGS;
+}
+
static __always_inline void
xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq)
{
xdp->frame_sz = frame_sz;
xdp->rxq = rxq;
+ xdp->flags = 0;
}

static __always_inline void
@@ -122,8 +143,14 @@ struct xdp_frame {
*/
struct xdp_mem_info mem;
struct net_device *dev_rx; /* used by cpumap */
+ u32 flags; /* supported values defined in xdp_buff_flags */
};

+static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame)
+{
+ return !!(frame->flags & XDP_FLAGS_HAS_FRAGS);
+}
+
#define XDP_BULK_QUEUE_SIZE 16
struct xdp_frame_bulk {
int count;
@@ -180,6 +207,7 @@ void xdp_convert_frame_to_buff(struct xd
xdp->data_end = frame->data + frame->len;
xdp->data_meta = frame->data - frame->metasize;
xdp->frame_sz = frame->frame_sz;
+ xdp->flags = frame->flags;
}

static inline
@@ -206,6 +234,7 @@ int xdp_update_frame_from_buff(struct xd
xdp_frame->headroom = headroom - sizeof(*xdp_frame);
xdp_frame->metasize = metasize;
xdp_frame->frame_sz = xdp->frame_sz;
+ xdp_frame->flags = xdp->flags;

return 0;
}
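
The contract for a multi-buffer driver follows from these helpers: set the frags flag on the head buffer and describe each tail page in the skb_shared_info area at its end. A hedged sketch (demo_add_frag() is hypothetical; bounds checking against MAX_SKB_FRAGS is omitted):

/* Sketch, not from the patch: linking a tail page into an xdp_buff. */
static void demo_add_frag(struct xdp_buff *xdp, struct page *page,
			  unsigned int offset, unsigned int len)
{
	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
	skb_frag_t *frag;

	if (!xdp_buff_has_frags(xdp)) {
		sinfo->nr_frags = 0;	/* first fragment: init shared_info */
		xdp_buff_set_frags_flag(xdp);
	}

	frag = &sinfo->frags[sinfo->nr_frags++];
	skb_frag_off_set(frag, offset);
	skb_frag_size_set(frag, len);
	__skb_frag_set_page(frag, page);
}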
@@ -0,0 +1,137 @@
From 7c48cb0176c6d6d3b55029f7ff4ffa05faee6446 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:50 +0100
Subject: [PATCH] xdp: add frags support to xdp_return_{buff/frame}

Take into account if the received xdp_buff/xdp_frame is non-linear
recycling/returning the frame memory to the allocator or into
xdp_frame_bulk.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/a961069febc868508ce1bdf5e53a343eb4e57cb2.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
include/net/xdp.h | 18 ++++++++++++++--
net/core/xdp.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 69 insertions(+), 3 deletions(-)

--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -275,10 +275,24 @@ void __xdp_release_frame(void *data, str
static inline void xdp_release_frame(struct xdp_frame *xdpf)
{
struct xdp_mem_info *mem = &xdpf->mem;
+ struct skb_shared_info *sinfo;
+ int i;

/* Curr only page_pool needs this */
- if (mem->type == MEM_TYPE_PAGE_POOL)
- __xdp_release_frame(xdpf->data, mem);
+ if (mem->type != MEM_TYPE_PAGE_POOL)
+ return;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_release_frame(page_address(page), mem);
+ }
+out:
+ __xdp_release_frame(xdpf->data, mem);
}

int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -376,12 +376,38 @@ static void __xdp_return(void *data, str

void xdp_return_frame(struct xdp_frame *xdpf)
{
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdpf->mem, false, NULL);
+ }
+out:
__xdp_return(xdpf->data, &xdpf->mem, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);

void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdpf->mem, true, NULL);
+ }
+out:
__xdp_return(xdpf->data, &xdpf->mem, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
@@ -417,7 +443,7 @@ void xdp_return_frame_bulk(struct xdp_fr
struct xdp_mem_allocator *xa;

if (mem->type != MEM_TYPE_PAGE_POOL) {
- __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+ xdp_return_frame(xdpf);
return;
}

@@ -436,12 +462,38 @@ void xdp_return_frame_bulk(struct xdp_fr
bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
}

+ if (unlikely(xdp_frame_has_frags(xdpf))) {
+ struct skb_shared_info *sinfo;
+ int i;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+
+ bq->q[bq->count++] = skb_frag_address(frag);
+ if (bq->count == XDP_BULK_QUEUE_SIZE)
+ xdp_flush_frame_bulk(bq);
+ }
+ }
bq->q[bq->count++] = xdpf->data;
}
EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);

void xdp_return_buff(struct xdp_buff *xdp)
{
+ struct skb_shared_info *sinfo;
+ int i;
+
+ if (likely(!xdp_buff_has_frags(xdp)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ struct page *page = skb_frag_page(&sinfo->frags[i]);
+
+ __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
+ }
+out:
__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
}

@@ -0,0 +1,31 @@
From d16697cb6261d4cc23422e6b1cb2759df8aa76d0 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 21 Jan 2022 11:09:44 +0100
Subject: [PATCH] net: skbuff: add size metadata to skb_shared_info for xdp

Introduce xdp_frags_size field in skb_shared_info data structure
to store xdp_buff/xdp_frame frame paged size (xdp_frags_size will
be used in xdp frags support). In order to not increase
skb_shared_info size we will use a hole due to skb_shared_info
alignment.

Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Link: https://lore.kernel.org/r/8a849819a3e0a143d540f78a3a5add76e17e980d.1642758637.git.lorenzo@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
include/linux/skbuff.h | 1 +
1 file changed, 1 insertion(+)

--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -568,6 +568,7 @@ struct skb_shared_info {
* Warning : all fields before dataref are cleared in __alloc_skb()
*/
atomic_t dataref;
+ unsigned int xdp_frags_size;

/* Intermediate layers must ensure that destructor_arg
* remains valid until skb destructor */
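
Drivers that link tail pages into the head buffer's skb_shared_info are expected to keep xdp_frags_size equal to the total paged length, since xdp_get_frame_len(), added further below in this series, relies on it. A hedged sketch (demo_account_frag() is hypothetical, not from the patch):

/* Sketch, not from the patch: account a just-added fragment's length. */
static void demo_account_frag(struct xdp_buff *xdp, unsigned int len)
{
	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);

	sinfo->xdp_frags_size += len;	/* total paged length of this frame */
}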
@@ -0,0 +1,65 @@
From 5142239a22219921a7863cf00c9ab853c00689d8 Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 11 Mar 2022 10:14:18 +0100
Subject: [PATCH] net: veth: Account total xdp_frame len running ndo_xdp_xmit

Even if this is a theoretical issue since it is not possible to perform
XDP_REDIRECT on a non-linear xdp_frame, veth driver does not account
paged area in ndo_xdp_xmit function pointer.
Introduce xdp_get_frame_len utility routine to get the xdp_frame full
length and account total frame size running XDP_REDIRECT of a
non-linear xdp frame into a veth device.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Hoiland-Jorgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/54f9fd3bb65d190daf2c0bbae2f852ff16cfbaa0.1646989407.git.lorenzo@kernel.org
---
drivers/net/veth.c | 4 ++--
include/net/xdp.h | 14 ++++++++++++++
2 files changed, 16 insertions(+), 2 deletions(-)

--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -501,7 +501,7 @@ static int veth_xdp_xmit(struct net_devi
struct xdp_frame *frame = frames[i];
void *ptr = veth_xdp_to_ptr(frame);

- if (unlikely(frame->len > max_len ||
+ if (unlikely(xdp_get_frame_len(frame) > max_len ||
__ptr_ring_produce(&rq->xdp_ring, ptr)))
break;
nxmit++;
@@ -862,7 +862,7 @@ static int veth_xdp_rcv(struct veth_rq *
/* ndo_xdp_xmit */
struct xdp_frame *frame = veth_ptr_to_xdp(ptr);

- stats->xdp_bytes += frame->len;
+ stats->xdp_bytes += xdp_get_frame_len(frame);
frame = veth_xdp_rcv_one(rq, frame, bq, stats);
if (frame) {
/* XDP_PASS */
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -295,6 +295,20 @@ out:
__xdp_release_frame(xdpf->data, mem);
}

+static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf)
+{
+ struct skb_shared_info *sinfo;
+ unsigned int len = xdpf->len;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ len += sinfo->xdp_frags_size;
+out:
+ return len;
+}
+
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
struct net_device *dev, u32 queue_index, unsigned int napi_id);
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
@@ -0,0 +1,40 @@
From 7cda76d858a4e71ac4a04066c093679a12e1312c Mon Sep 17 00:00:00 2001
From: Lorenzo Bianconi <lorenzo@kernel.org>
Date: Fri, 11 Mar 2022 10:14:20 +0100
Subject: [PATCH] veth: Allow jumbo frames in xdp mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Allow increasing the MTU over page boundaries on veth devices
if the attached xdp program declares to support xdp fragments.

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/d5dc039c3d4123426e7023a488c449181a7bc57f.1646989407.git.lorenzo@kernel.org
---
drivers/net/veth.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)

--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -1471,9 +1471,14 @@ static int veth_xdp_set(struct net_devic
goto err;
}

- max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
- peer->hard_header_len -
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ max_mtu = SKB_WITH_OVERHEAD(PAGE_SIZE - VETH_XDP_HEADROOM) -
+ peer->hard_header_len;
+ /* Allow increasing the max_mtu if the program supports
+ * XDP fragments.
+ */
+ //if (prog->aux->xdp_has_frags)
+ max_mtu += PAGE_SIZE * MAX_SKB_FRAGS;
+
if (peer->mtu > max_mtu) {
NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
err = -ERANGE;
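
For scale, with the usual 4 KiB pages the arithmetic works out roughly as follows (assuming XDP_PACKET_HEADROOM 256, NET_IP_ALIGN 0, sizeof(struct skb_shared_info) padded to 320, a 14-byte Ethernet header and MAX_SKB_FRAGS 17; exact values vary by config):

/* base:  SKB_WITH_OVERHEAD(4096 - 256) - 14 = 4096 - 256 - 320 - 14 = 3506
 * frags: PAGE_SIZE * MAX_SKB_FRAGS = 4096 * 17 = 69632
 * so max_mtu grows from ~3506 to ~73138 bytes once frags are allowed.
 */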
@ -0,0 +1,56 @@
|
||||
From: Qingfang DENG <qingfang.deng@siflower.com.cn>
|
||||
Date: Fri, 3 Feb 2023 09:16:11 +0800
|
||||
Subject: [PATCH] net: page_pool: use in_softirq() instead
|
||||
|
||||
We use BH context only for synchronization, so we don't care if it's
|
||||
actually serving softirq or not.
|
||||
|
||||
As a side node, in case of threaded NAPI, in_serving_softirq() will
|
||||
return false because it's in process context with BH off, making
|
||||
page_pool_recycle_in_cache() unreachable.
|
||||
|
||||
Signed-off-by: Qingfang DENG <qingfang.deng@siflower.com.cn>
|
||||
---
|
||||
|
||||
--- a/include/net/page_pool.h
|
||||
+++ b/include/net/page_pool.h
|
||||
@@ -357,7 +357,7 @@ static inline void page_pool_nid_changed
|
||||
static inline void page_pool_ring_lock(struct page_pool *pool)
|
||||
__acquires(&pool->ring.producer_lock)
|
||||
{
|
||||
- if (in_serving_softirq())
|
||||
+ if (in_softirq())
|
||||
spin_lock(&pool->ring.producer_lock);
|
||||
else
|
||||
spin_lock_bh(&pool->ring.producer_lock);
|
||||
@@ -366,7 +366,7 @@ static inline void page_pool_ring_lock(s
|
||||
static inline void page_pool_ring_unlock(struct page_pool *pool)
|
||||
__releases(&pool->ring.producer_lock)
|
||||
{
|
||||
- if (in_serving_softirq())
|
||||
+ if (in_softirq())
|
||||
spin_unlock(&pool->ring.producer_lock);
|
||||
else
|
||||
spin_unlock_bh(&pool->ring.producer_lock);
|
||||
--- a/net/core/page_pool.c
|
||||
+++ b/net/core/page_pool.c
|
||||
@@ -512,8 +512,8 @@ static void page_pool_return_page(struct
|
||||
static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page)
|
||||
{
|
||||
int ret;
|
||||
- /* BH protection not needed if current is serving softirq */
|
||||
- if (in_serving_softirq())
|
||||
+ /* BH protection not needed if current is softirq */
|
||||
+ if (in_softirq())
|
||||
ret = ptr_ring_produce(&pool->ring, page);
|
||||
else
|
||||
ret = ptr_ring_produce_bh(&pool->ring, page);
|
||||
@@ -576,7 +576,7 @@ __page_pool_put_page(struct page_pool *p
|
||||
page_pool_dma_sync_for_device(pool, page,
|
||||
dma_sync_size);
|
||||
|
||||
- if (allow_direct && in_serving_softirq() &&
|
||||
+ if (allow_direct && in_softirq() &&
|
||||
page_pool_recycle_in_cache(page, pool))
|
||||
return NULL;
|
||||
|
@@ -0,0 +1,41 @@
From 7390609b0121a1b982c5ecdfcd72dc328e5784ee Mon Sep 17 00:00:00 2001
From: Michael Walle <michael@walle.cc>
Date: Mon, 6 Feb 2023 13:43:42 +0000
Subject: [PATCH] net: add helper eth_addr_add()

Add a helper to add an offset to a ethernet address. This comes in handy
if you have a base ethernet address for multiple interfaces.

Signed-off-by: Michael Walle <michael@walle.cc>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Srinivas Kandagatla <srinivas.kandagatla@linaro.org>
Link: https://lore.kernel.org/r/20230206134356.839737-9-srinivas.kandagatla@linaro.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
include/linux/etherdevice.h | 14 ++++++++++++++
1 file changed, 14 insertions(+)

--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -478,6 +478,20 @@ static inline void eth_addr_inc(u8 *addr
}

/**
+ * eth_addr_add() - Add (or subtract) an offset to/from the given MAC address.
+ *
+ * @offset: Offset to add.
+ * @addr: Pointer to a six-byte array containing Ethernet address to increment.
+ */
+static inline void eth_addr_add(u8 *addr, long offset)
+{
+ u64 u = ether_addr_to_u64(addr);
+
+ u += offset;
+ u64_to_ether_addr(u, addr);
+}
+
+/**
* is_etherdev_addr - Tell if given Ethernet address belongs to the device.
* @dev: Pointer to a device structure
* @addr: Pointer to a six-byte array containing the Ethernet address
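
Typical use is deriving per-interface addresses from one base MAC, for example one read from an OTP or fuse block. A hedged sketch (demo_port_addr() is hypothetical, not from the patch):

/* Sketch, not from the patch: base MAC plus port index. */
static void demo_port_addr(u8 *out, const u8 *base, int port)
{
	ether_addr_copy(out, base);
	eth_addr_add(out, port);	/* e.g. 02:00:00:00:00:00 + 3 -> ...:03 */
}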
@@ -0,0 +1,279 @@
From dc452a471dbae8aca8257c565174212620880093 Mon Sep 17 00:00:00 2001
From: Vladimir Oltean <vladimir.oltean@nxp.com>
Date: Fri, 10 Dec 2021 01:34:37 +0200
Subject: net: dsa: introduce tagger-owned storage for private and shared data

Ansuel is working on register access over Ethernet for the qca8k switch
family. This requires the qca8k tagging protocol driver to receive
frames which aren't intended for the network stack, but instead for the
qca8k switch driver itself.

The dp->priv is currently the prevailing method for passing data back
and forth between the tagging protocol driver and the switch driver.
However, this method is riddled with caveats.

The DSA design allows in principle for any switch driver to return any
protocol it desires in ->get_tag_protocol(). The dsa_loop driver can be
modified to do just that. But in the current design, the memory behind
dp->priv has to be allocated by the switch driver, so if the tagging
protocol is paired to an unexpected switch driver, we may end up in NULL
pointer dereferences inside the kernel, or worse (a switch driver may
allocate dp->priv according to the expectations of a different tagger).

The latter possibility is even more plausible considering that DSA
switches can dynamically change tagging protocols in certain cases
(dsa <-> edsa, ocelot <-> ocelot-8021q), and the current design lends
itself to mistakes that are all too easy to make.

This patch proposes that the tagging protocol driver should manage its
own memory, instead of relying on the switch driver to do so.
After analyzing the different in-tree needs, it can be observed that the
required tagger storage is per switch, therefore a ds->tagger_data
pointer is introduced. In principle, per-port storage could also be
introduced, although there is no need for it at the moment. Future
changes will replace the current usage of dp->priv with ds->tagger_data.

We define a "binding" event between the DSA switch tree and the tagging
protocol. During this binding event, the tagging protocol's ->connect()
method is called first, and this may allocate some memory for each
switch of the tree. Then a cross-chip notifier is emitted for the
switches within that tree, and they are given the opportunity to fix up
the tagger's memory (for example, they might set up some function
pointers that represent virtual methods for consuming packets).
Because the memory is owned by the tagger, there exists a ->disconnect()
method for the tagger (which is the place to free the resources), but
there doesn't exist a ->disconnect() method for the switch driver.
This is part of the design. The switch driver should make minimal use of
the public part of the tagger data, and only after type-checking it
using the supplied "proto" argument.

In the code there are in fact two binding events, one is the initial
event in dsa_switch_setup_tag_protocol(). At this stage, the cross chip
notifier chains aren't initialized, so we call each switch's connect()
method by hand. Then there is dsa_tree_bind_tag_proto() during
dsa_tree_change_tag_proto(), and here we have an old protocol and a new
one. We first connect to the new one before disconnecting from the old
one, to simplify error handling a bit and to ensure we remain in a valid
state at all times.

Co-developed-by: Ansuel Smith <ansuelsmth@gmail.com>
Signed-off-by: Ansuel Smith <ansuelsmth@gmail.com>
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/net/dsa.h | 12 +++++++++
net/dsa/dsa2.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++---
net/dsa/dsa_priv.h | 1 +
net/dsa/switch.c | 14 +++++++++++
4 files changed, 96 insertions(+), 4 deletions(-)

--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -80,12 +80,15 @@ enum dsa_tag_protocol {
};

struct dsa_switch;
+struct dsa_switch_tree;

struct dsa_device_ops {
struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
int *offset);
+ int (*connect)(struct dsa_switch_tree *dst);
+ void (*disconnect)(struct dsa_switch_tree *dst);
unsigned int needed_headroom;
unsigned int needed_tailroom;
const char *name;
@@ -329,6 +332,8 @@ struct dsa_switch {
*/
void *priv;

+ void *tagger_data;
+
/*
* Configuration data for this switch.
*/
@@ -584,6 +589,13 @@ struct dsa_switch_ops {
enum dsa_tag_protocol mprot);
int (*change_tag_protocol)(struct dsa_switch *ds, int port,
enum dsa_tag_protocol proto);
+ /*
+ * Method for switch drivers to connect to the tagging protocol driver
+ * in current use. The switch driver can provide handlers for certain
+ * types of packets for switch management.
+ */
+ int (*connect_tag_protocol)(struct dsa_switch *ds,
+ enum dsa_tag_protocol proto);

/* Optional switch-wide initialization and destruction methods */
int (*setup)(struct dsa_switch *ds);
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -230,8 +230,12 @@ static struct dsa_switch_tree *dsa_tree_

static void dsa_tree_free(struct dsa_switch_tree *dst)
{
- if (dst->tag_ops)
+ if (dst->tag_ops) {
+ if (dst->tag_ops->disconnect)
+ dst->tag_ops->disconnect(dst);
+
dsa_tag_driver_put(dst->tag_ops);
+ }
list_del(&dst->list);
kfree(dst);
}
@@ -805,7 +809,7 @@ static int dsa_switch_setup_tag_protocol
int port, err;

if (tag_ops->proto == dst->default_proto)
- return 0;
+ goto connect;

for (port = 0; port < ds->num_ports; port++) {
if (!dsa_is_cpu_port(ds, port))
@@ -821,6 +825,17 @@ static int dsa_switch_setup_tag_protocol
}
}

+connect:
+ if (ds->ops->connect_tag_protocol) {
+ err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+ if (err) {
+ dev_err(ds->dev,
+ "Unable to connect to tag protocol \"%s\": %pe\n",
+ tag_ops->name, ERR_PTR(err));
+ return err;
+ }
+ }
+
return 0;
}

@@ -1132,6 +1147,46 @@ static void dsa_tree_teardown(struct dsa
dst->setup = false;
}

+static int dsa_tree_bind_tag_proto(struct dsa_switch_tree *dst,
+ const struct dsa_device_ops *tag_ops)
+{
+ const struct dsa_device_ops *old_tag_ops = dst->tag_ops;
+ struct dsa_notifier_tag_proto_info info;
+ int err;
+
+ dst->tag_ops = tag_ops;
+
+ /* Notify the new tagger about the connection to this tree */
+ if (tag_ops->connect) {
+ err = tag_ops->connect(dst);
+ if (err)
+ goto out_revert;
+ }
+
+ /* Notify the switches from this tree about the connection
+ * to the new tagger
+ */
+ info.tag_ops = tag_ops;
+ err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_CONNECT, &info);
+ if (err && err != -EOPNOTSUPP)
+ goto out_disconnect;
+
+ /* Notify the old tagger about the disconnection from this tree */
+ if (old_tag_ops->disconnect)
+ old_tag_ops->disconnect(dst);
+
+ return 0;
+
+out_disconnect:
+ /* Revert the new tagger's connection to this tree */
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(dst);
+out_revert:
+ dst->tag_ops = old_tag_ops;
+
+ return err;
+}
+
/* Since the dsa/tagging sysfs device attribute is per master, the assumption
* is that all DSA switches within a tree share the same tagger, otherwise
* they would have formed disjoint trees (different "dsa,member" values).
@@ -1164,12 +1219,15 @@ int dsa_tree_change_tag_proto(struct dsa
goto out_unlock;
}

+ /* Notify the tag protocol change */
info.tag_ops = tag_ops;
err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
if (err)
- goto out_unwind_tagger;
+ return err;

- dst->tag_ops = tag_ops;
+ err = dsa_tree_bind_tag_proto(dst, tag_ops);
+ if (err)
+ goto out_unwind_tagger;

rtnl_unlock();

@@ -1257,6 +1315,7 @@ static int dsa_port_parse_cpu(struct dsa
struct dsa_switch *ds = dp->ds;
struct dsa_switch_tree *dst = ds->dst;
enum dsa_tag_protocol default_proto;
+ int err;

/* Find out which protocol the switch would prefer. */
default_proto = dsa_get_tag_protocol(dp, master);
@@ -1311,6 +1370,12 @@ static int dsa_port_parse_cpu(struct dsa
*/
dsa_tag_driver_put(tag_ops);
} else {
+ if (tag_ops->connect) {
+ err = tag_ops->connect(dst);
+ if (err)
+ return err;
+ }
+
dst->tag_ops = tag_ops;
}

--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -37,6 +37,7 @@ enum {
DSA_NOTIFIER_VLAN_DEL,
DSA_NOTIFIER_MTU,
DSA_NOTIFIER_TAG_PROTO,
+ DSA_NOTIFIER_TAG_PROTO_CONNECT,
DSA_NOTIFIER_MRP_ADD,
DSA_NOTIFIER_MRP_DEL,
DSA_NOTIFIER_MRP_ADD_RING_ROLE,
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -616,6 +616,17 @@ static int dsa_switch_change_tag_proto(s
return 0;
}

+static int dsa_switch_connect_tag_proto(struct dsa_switch *ds,
+ struct dsa_notifier_tag_proto_info *info)
+{
+ const struct dsa_device_ops *tag_ops = info->tag_ops;
+
+ if (!ds->ops->connect_tag_protocol)
+ return -EOPNOTSUPP;
+
+ return ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+}
+
static int dsa_switch_mrp_add(struct dsa_switch *ds,
struct dsa_notifier_mrp_info *info)
{
@@ -735,6 +746,9 @@ static int dsa_switch_event(struct notif
case DSA_NOTIFIER_TAG_PROTO:
err = dsa_switch_change_tag_proto(ds, info);
break;
+ case DSA_NOTIFIER_TAG_PROTO_CONNECT:
+ err = dsa_switch_connect_tag_proto(ds, info);
+ break;
case DSA_NOTIFIER_MRP_ADD:
err = dsa_switch_mrp_add(ds, info);
break;
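
The shape of a tagger-side ->connect()/->disconnect() pair under this scheme might look as follows (the demo_tagger_data layout and function names are hypothetical; note the follow-up patch right below changes the argument from the tree to a single switch):

/* Sketch, not from the patch: tagger-owned per-switch storage. */
struct demo_tagger_data {
	void (*meta_rcv)(struct sk_buff *skb);
};

static void demo_disconnect(struct dsa_switch_tree *dst)
{
	struct dsa_port *dp;

	list_for_each_entry(dp, &dst->ports, list) {
		kfree(dp->ds->tagger_data);
		dp->ds->tagger_data = NULL;	/* ports sharing a switch: kfree(NULL) is fine */
	}
}

static int demo_connect(struct dsa_switch_tree *dst)
{
	struct dsa_port *dp;

	list_for_each_entry(dp, &dst->ports, list) {
		if (dp->ds->tagger_data)
			continue;	/* several ports share one switch */

		dp->ds->tagger_data = kzalloc(sizeof(struct demo_tagger_data),
					      GFP_KERNEL);
		if (!dp->ds->tagger_data) {
			demo_disconnect(dst);	/* free what we allocated */
			return -ENOMEM;
		}
	}

	return 0;
}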
@ -0,0 +1,274 @@
|
||||
From 7f2973149c22e7a6fee4c0c9fa6b8e4108e9c208 Mon Sep 17 00:00:00 2001
|
||||
From: Vladimir Oltean <vladimir.oltean@nxp.com>
|
||||
Date: Tue, 14 Dec 2021 03:45:36 +0200
|
||||
Subject: net: dsa: make tagging protocols connect to individual switches from
|
||||
a tree
|
||||
|
||||
On the NXP Bluebox 3 board which uses a multi-switch setup with sja1105,
|
||||
the mechanism through which the tagger connects to the switch tree is
|
||||
broken, due to improper DSA code design. At the time when tag_ops->connect()
|
||||
is called in dsa_port_parse_cpu(), DSA hasn't finished "touching" all
|
||||
the ports, so it doesn't know how large the tree is and how many ports
|
||||
it has. It has just seen the first CPU port by this time. As a result,
|
||||
this function will call the tagger's ->connect method too early, and the
|
||||
tagger will connect only to the first switch from the tree.
|
||||
|
||||
This could be perhaps addressed a bit more simply by just moving the
|
||||
tag_ops->connect(dst) call a bit later (for example in dsa_tree_setup),
|
||||
but there is already a design inconsistency at present: on the switch
|
||||
side, the notification is on a per-switch basis, but on the tagger side,
|
||||
it is on a per-tree basis. Furthermore, the persistent storage itself is
|
||||
per switch (ds->tagger_data). And the tagger connect and disconnect
|
||||
procedures (at least the ones that exist currently) could see a fair bit
|
||||
of simplification if they didn't have to iterate through the switches of
|
||||
a tree.
|
||||
|
||||
To fix the issue, this change transforms tag_ops->connect(dst) into
|
||||
tag_ops->connect(ds) and moves it somewhere where we already iterate
|
||||
over all switches of a tree. That is in dsa_switch_setup_tag_protocol(),
|
||||
which is a good placement because we already have there the connection
|
||||
call to the switch side of things.
|
||||
|
||||
As for the dsa_tree_bind_tag_proto() method (called from the code path
|
||||
that changes the tag protocol), things are a bit more complicated
|
||||
because we receive the tree as argument, yet when we unwind on errors,
|
||||
it would be nice to not call tag_ops->disconnect(ds) where we didn't
|
||||
previously call tag_ops->connect(ds). We didn't have this problem before
|
||||
because the tag_ops connection operations passed the entire dst before,
|
||||
and this is more fine grained now. To solve the error rewind case using
|
||||
the new API, we have to create yet one more cross-chip notifier for
|
||||
disconnection, and stay connected with the old tag protocol to all the
|
||||
switches in the tree until we've succeeded to connect with the new one
|
||||
as well. So if something fails half way, the whole tree is still
|
||||
connected to the old tagger. But there may still be leaks if the tagger
|
||||
fails to connect to the 2nd out of 3 switches in a tree: somebody needs
|
||||
to tell the tagger to disconnect from the first switch. Nothing comes
|
||||
for free, and this was previously handled privately by the tagging
|
||||
protocol driver before, but now we need to emit a disconnect cross-chip
|
||||
notifier for that, because DSA has to take care of the unwind path. We
|
||||
assume that the tagging protocol has connected to a switch if it has set
|
||||
ds->tagger_data to something, otherwise we avoid calling its
|
||||
disconnection method in the error rewind path.
|
||||
|
||||
The rest of the changes are in the tagging protocol drivers, and have to
|
||||
do with the replacement of dst with ds. The iteration is removed and the
|
||||
error unwind path is simplified, as mentioned above.
|
||||
|
||||
Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
|
||||
Signed-off-by: David S. Miller <davem@davemloft.net>
|
||||
---
|
||||
include/net/dsa.h | 5 ++--
|
||||
net/dsa/dsa2.c | 44 +++++++++++++-----------------
|
||||
net/dsa/dsa_priv.h | 1 +
|
||||
net/dsa/switch.c | 52 ++++++++++++++++++++++++++++++++---
|
||||
net/dsa/tag_ocelot_8021q.c | 53 +++++++++++-------------------------
|
||||
net/dsa/tag_sja1105.c | 67 ++++++++++++++++------------------------------
|
||||
6 files changed, 109 insertions(+), 113 deletions(-)
|
||||
|
||||
--- a/include/net/dsa.h
+++ b/include/net/dsa.h
@@ -80,15 +80,14 @@ enum dsa_tag_protocol {
};

struct dsa_switch;
-struct dsa_switch_tree;

struct dsa_device_ops {
struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev);
struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev);
void (*flow_dissect)(const struct sk_buff *skb, __be16 *proto,
int *offset);
- int (*connect)(struct dsa_switch_tree *dst);
- void (*disconnect)(struct dsa_switch_tree *dst);
+ int (*connect)(struct dsa_switch *ds);
+ void (*disconnect)(struct dsa_switch *ds);
unsigned int needed_headroom;
unsigned int needed_tailroom;
const char *name;
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -230,12 +230,8 @@ static struct dsa_switch_tree *dsa_tree_

static void dsa_tree_free(struct dsa_switch_tree *dst)
{
- if (dst->tag_ops) {
- if (dst->tag_ops->disconnect)
- dst->tag_ops->disconnect(dst);
-
+ if (dst->tag_ops)
dsa_tag_driver_put(dst->tag_ops);
- }
list_del(&dst->list);
kfree(dst);
}
@@ -826,17 +822,29 @@ static int dsa_switch_setup_tag_protocol
}

connect:
+ if (tag_ops->connect) {
+ err = tag_ops->connect(ds);
+ if (err)
+ return err;
+ }
+
if (ds->ops->connect_tag_protocol) {
err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
if (err) {
dev_err(ds->dev,
"Unable to connect to tag protocol \"%s\": %pe\n",
tag_ops->name, ERR_PTR(err));
- return err;
+ goto disconnect;
}
}

return 0;
+
+disconnect:
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(ds);
+
+ return err;
}

static int dsa_switch_setup(struct dsa_switch *ds)
@@ -1156,13 +1164,6 @@ static int dsa_tree_bind_tag_proto(struc

dst->tag_ops = tag_ops;

- /* Notify the new tagger about the connection to this tree */
- if (tag_ops->connect) {
- err = tag_ops->connect(dst);
- if (err)
- goto out_revert;
- }
-
/* Notify the switches from this tree about the connection
* to the new tagger
*/
@@ -1172,16 +1173,14 @@ static int dsa_tree_bind_tag_proto(struc
goto out_disconnect;

/* Notify the old tagger about the disconnection from this tree */
- if (old_tag_ops->disconnect)
- old_tag_ops->disconnect(dst);
+ info.tag_ops = old_tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);

return 0;

out_disconnect:
- /* Revert the new tagger's connection to this tree */
- if (tag_ops->disconnect)
- tag_ops->disconnect(dst);
-out_revert:
+ info.tag_ops = tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
dst->tag_ops = old_tag_ops;

return err;
@@ -1315,7 +1314,6 @@ static int dsa_port_parse_cpu(struct dsa
struct dsa_switch *ds = dp->ds;
struct dsa_switch_tree *dst = ds->dst;
enum dsa_tag_protocol default_proto;
- int err;

/* Find out which protocol the switch would prefer. */
default_proto = dsa_get_tag_protocol(dp, master);
@@ -1370,12 +1368,6 @@ static int dsa_port_parse_cpu(struct dsa
*/
dsa_tag_driver_put(tag_ops);
} else {
- if (tag_ops->connect) {
- err = tag_ops->connect(dst);
- if (err)
- return err;
- }
-
dst->tag_ops = tag_ops;
}

--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -38,6 +38,7 @@ enum {
DSA_NOTIFIER_MTU,
DSA_NOTIFIER_TAG_PROTO,
DSA_NOTIFIER_TAG_PROTO_CONNECT,
+ DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
DSA_NOTIFIER_MRP_ADD,
DSA_NOTIFIER_MRP_DEL,
DSA_NOTIFIER_MRP_ADD_RING_ROLE,
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -616,15 +616,58 @@ static int dsa_switch_change_tag_proto(s
return 0;
}

-static int dsa_switch_connect_tag_proto(struct dsa_switch *ds,
- struct dsa_notifier_tag_proto_info *info)
+/* We use the same cross-chip notifiers to inform both the tagger side, as well
+ * as the switch side, of connection and disconnection events.
+ * Since ds->tagger_data is owned by the tagger, it isn't a hard error if the
+ * switch side doesn't support connecting to this tagger, and therefore, the
+ * fact that we don't disconnect the tagger side doesn't constitute a memory
+ * leak: the tagger will still operate with persistent per-switch memory, just
+ * with the switch side unconnected to it. What does constitute a hard error is
+ * when the switch side supports connecting but fails.
+ */
+static int
+dsa_switch_connect_tag_proto(struct dsa_switch *ds,
+ struct dsa_notifier_tag_proto_info *info)
{
const struct dsa_device_ops *tag_ops = info->tag_ops;
+ int err;
+
+ /* Notify the new tagger about the connection to this switch */
+ if (tag_ops->connect) {
+ err = tag_ops->connect(ds);
+ if (err)
+ return err;
+ }

if (!ds->ops->connect_tag_protocol)
return -EOPNOTSUPP;

- return ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+ /* Notify the switch about the connection to the new tagger */
+ err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+ if (err) {
+ /* Revert the new tagger's connection to this tree */
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(ds);
+ return err;
+ }
+
+ return 0;
+}
+
+static int
+dsa_switch_disconnect_tag_proto(struct dsa_switch *ds,
+ struct dsa_notifier_tag_proto_info *info)
+{
+ const struct dsa_device_ops *tag_ops = info->tag_ops;
+
+ /* Notify the tagger about the disconnection from this switch */
+ if (tag_ops->disconnect && ds->tagger_data)
+ tag_ops->disconnect(ds);
+
+ /* No need to notify the switch, since it shouldn't have any
+ * resources to tear down
+ */
+ return 0;
}

static int dsa_switch_mrp_add(struct dsa_switch *ds,
@@ -749,6 +792,9 @@ static int dsa_switch_event(struct notif
case DSA_NOTIFIER_TAG_PROTO_CONNECT:
err = dsa_switch_connect_tag_proto(ds, info);
break;
+ case DSA_NOTIFIER_TAG_PROTO_DISCONNECT:
+ err = dsa_switch_disconnect_tag_proto(ds, info);
+ break;
case DSA_NOTIFIER_MRP_ADD:
err = dsa_switch_mrp_add(ds, info);
break;
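For context, here is a minimal sketch of a tagger using the new per-switch hooks. The names are hypothetical and not part of this patch: the tagger allocates its private state in .connect() and frees it in .disconnect(), with ds->tagger_data staying tagger-owned, as the switch.c comment above describes.

#include <linux/slab.h>
#include <net/dsa.h>

/* hypothetical per-switch state owned by the tagger */
struct example_tagger_priv {
	u16 proto_id;
};

static int example_tagger_connect(struct dsa_switch *ds)
{
	struct example_tagger_priv *priv;

	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	/* ds->tagger_data is owned by the tagger, never by the switch */
	ds->tagger_data = priv;
	return 0;
}

static void example_tagger_disconnect(struct dsa_switch *ds)
{
	kfree(ds->tagger_data);
	ds->tagger_data = NULL;
}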
@ -0,0 +1,327 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Sat, 5 Feb 2022 17:59:07 +0100
Subject: [PATCH] net: ethernet: mtk_eth_soc: add support for coherent
 DMA

It improves performance by eliminating the need for a cache flush on rx and tx.

In preparation for supporting WED (Wireless Ethernet Dispatch), also add a
function for disabling coherent DMA at runtime.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -9,6 +9,7 @@
#include <linux/of_device.h>
#include <linux/of_mdio.h>
#include <linux/of_net.h>
+#include <linux/of_address.h>
#include <linux/mfd/syscon.h>
#include <linux/regmap.h>
#include <linux/clk.h>
@@ -840,7 +841,7 @@ static int mtk_init_fq_dma(struct mtk_et
dma_addr_t dma_addr;
int i;

- eth->scratch_ring = dma_alloc_coherent(eth->dev,
+ eth->scratch_ring = dma_alloc_coherent(eth->dma_dev,
cnt * sizeof(struct mtk_tx_dma),
&eth->phy_scratch_ring,
GFP_ATOMIC);
@@ -852,10 +853,10 @@ static int mtk_init_fq_dma(struct mtk_et
if (unlikely(!eth->scratch_head))
return -ENOMEM;

- dma_addr = dma_map_single(eth->dev,
+ dma_addr = dma_map_single(eth->dma_dev,
eth->scratch_head, cnt * MTK_QDMA_PAGE_SIZE,
DMA_FROM_DEVICE);
- if (unlikely(dma_mapping_error(eth->dev, dma_addr)))
+ if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr)))
return -ENOMEM;

phy_ring_tail = eth->phy_scratch_ring +
@@ -909,26 +910,26 @@ static void mtk_tx_unmap(struct mtk_eth
{
if (MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
if (tx_buf->flags & MTK_TX_FLAGS_SINGLE0) {
- dma_unmap_single(eth->dev,
+ dma_unmap_single(eth->dma_dev,
dma_unmap_addr(tx_buf, dma_addr0),
dma_unmap_len(tx_buf, dma_len0),
DMA_TO_DEVICE);
} else if (tx_buf->flags & MTK_TX_FLAGS_PAGE0) {
- dma_unmap_page(eth->dev,
+ dma_unmap_page(eth->dma_dev,
dma_unmap_addr(tx_buf, dma_addr0),
dma_unmap_len(tx_buf, dma_len0),
DMA_TO_DEVICE);
}
} else {
if (dma_unmap_len(tx_buf, dma_len0)) {
- dma_unmap_page(eth->dev,
+ dma_unmap_page(eth->dma_dev,
dma_unmap_addr(tx_buf, dma_addr0),
dma_unmap_len(tx_buf, dma_len0),
DMA_TO_DEVICE);
}

if (dma_unmap_len(tx_buf, dma_len1)) {
- dma_unmap_page(eth->dev,
+ dma_unmap_page(eth->dma_dev,
dma_unmap_addr(tx_buf, dma_addr1),
dma_unmap_len(tx_buf, dma_len1),
DMA_TO_DEVICE);
@@ -1006,9 +1007,9 @@ static int mtk_tx_map(struct sk_buff *sk
if (skb_vlan_tag_present(skb))
txd4 |= TX_DMA_INS_VLAN | skb_vlan_tag_get(skb);

- mapped_addr = dma_map_single(eth->dev, skb->data,
+ mapped_addr = dma_map_single(eth->dma_dev, skb->data,
skb_headlen(skb), DMA_TO_DEVICE);
- if (unlikely(dma_mapping_error(eth->dev, mapped_addr)))
+ if (unlikely(dma_mapping_error(eth->dma_dev, mapped_addr)))
return -ENOMEM;

WRITE_ONCE(itxd->txd1, mapped_addr);
@@ -1047,10 +1048,10 @@ static int mtk_tx_map(struct sk_buff *sk


frag_map_size = min(frag_size, MTK_TX_DMA_BUF_LEN);
- mapped_addr = skb_frag_dma_map(eth->dev, frag, offset,
+ mapped_addr = skb_frag_dma_map(eth->dma_dev, frag, offset,
frag_map_size,
DMA_TO_DEVICE);
- if (unlikely(dma_mapping_error(eth->dev, mapped_addr)))
+ if (unlikely(dma_mapping_error(eth->dma_dev, mapped_addr)))
goto err_dma;

if (i == nr_frags - 1 &&
@@ -1331,18 +1332,18 @@ static int mtk_poll_rx(struct napi_struc
netdev->stats.rx_dropped++;
goto release_desc;
}
- dma_addr = dma_map_single(eth->dev,
+ dma_addr = dma_map_single(eth->dma_dev,
new_data + NET_SKB_PAD +
eth->ip_align,
ring->buf_size,
DMA_FROM_DEVICE);
- if (unlikely(dma_mapping_error(eth->dev, dma_addr))) {
+ if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr))) {
skb_free_frag(new_data);
netdev->stats.rx_dropped++;
goto release_desc;
}

- dma_unmap_single(eth->dev, trxd.rxd1,
+ dma_unmap_single(eth->dma_dev, trxd.rxd1,
ring->buf_size, DMA_FROM_DEVICE);

/* receive data */
@@ -1615,7 +1616,7 @@ static int mtk_tx_alloc(struct mtk_eth *
if (!ring->buf)
goto no_tx_mem;

- ring->dma = dma_alloc_coherent(eth->dev, MTK_DMA_SIZE * sz,
+ ring->dma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz,
&ring->phys, GFP_ATOMIC);
if (!ring->dma)
goto no_tx_mem;
@@ -1633,7 +1634,7 @@ static int mtk_tx_alloc(struct mtk_eth *
* descriptors in ring->dma_pdma.
*/
if (!MTK_HAS_CAPS(eth->soc->caps, MTK_QDMA)) {
- ring->dma_pdma = dma_alloc_coherent(eth->dev, MTK_DMA_SIZE * sz,
+ ring->dma_pdma = dma_alloc_coherent(eth->dma_dev, MTK_DMA_SIZE * sz,
&ring->phys_pdma,
GFP_ATOMIC);
if (!ring->dma_pdma)
@@ -1692,7 +1693,7 @@ static void mtk_tx_clean(struct mtk_eth
}

if (ring->dma) {
- dma_free_coherent(eth->dev,
+ dma_free_coherent(eth->dma_dev,
MTK_DMA_SIZE * sizeof(*ring->dma),
ring->dma,
ring->phys);
@@ -1700,7 +1701,7 @@ static void mtk_tx_clean(struct mtk_eth
}

if (ring->dma_pdma) {
- dma_free_coherent(eth->dev,
+ dma_free_coherent(eth->dma_dev,
MTK_DMA_SIZE * sizeof(*ring->dma_pdma),
ring->dma_pdma,
ring->phys_pdma);
@@ -1748,18 +1749,18 @@ static int mtk_rx_alloc(struct mtk_eth *
return -ENOMEM;
}

- ring->dma = dma_alloc_coherent(eth->dev,
+ ring->dma = dma_alloc_coherent(eth->dma_dev,
rx_dma_size * sizeof(*ring->dma),
&ring->phys, GFP_ATOMIC);
if (!ring->dma)
return -ENOMEM;

for (i = 0; i < rx_dma_size; i++) {
- dma_addr_t dma_addr = dma_map_single(eth->dev,
+ dma_addr_t dma_addr = dma_map_single(eth->dma_dev,
ring->data[i] + NET_SKB_PAD + eth->ip_align,
ring->buf_size,
DMA_FROM_DEVICE);
- if (unlikely(dma_mapping_error(eth->dev, dma_addr)))
+ if (unlikely(dma_mapping_error(eth->dma_dev, dma_addr)))
return -ENOMEM;
ring->dma[i].rxd1 = (unsigned int)dma_addr;

@@ -1795,7 +1796,7 @@ static void mtk_rx_clean(struct mtk_eth
continue;
if (!ring->dma[i].rxd1)
continue;
- dma_unmap_single(eth->dev,
+ dma_unmap_single(eth->dma_dev,
ring->dma[i].rxd1,
ring->buf_size,
DMA_FROM_DEVICE);
@@ -1806,7 +1807,7 @@ static void mtk_rx_clean(struct mtk_eth
}

if (ring->dma) {
- dma_free_coherent(eth->dev,
+ dma_free_coherent(eth->dma_dev,
ring->dma_size * sizeof(*ring->dma),
ring->dma,
ring->phys);
@@ -2162,7 +2163,7 @@ static void mtk_dma_free(struct mtk_eth
if (eth->netdev[i])
netdev_reset_queue(eth->netdev[i]);
if (eth->scratch_ring) {
- dma_free_coherent(eth->dev,
+ dma_free_coherent(eth->dma_dev,
MTK_DMA_SIZE * sizeof(struct mtk_tx_dma),
eth->scratch_ring,
eth->phy_scratch_ring);
@@ -2514,6 +2515,8 @@ static void mtk_dim_tx(struct work_struc

static int mtk_hw_init(struct mtk_eth *eth)
{
+ u32 dma_mask = ETHSYS_DMA_AG_MAP_PDMA | ETHSYS_DMA_AG_MAP_QDMA |
+ ETHSYS_DMA_AG_MAP_PPE;
int i, val, ret;

if (test_and_set_bit(MTK_HW_INIT, &eth->state))
@@ -2526,6 +2529,10 @@ static int mtk_hw_init(struct mtk_eth *e
if (ret)
goto err_disable_pm;

+ if (eth->ethsys)
+ regmap_update_bits(eth->ethsys, ETHSYS_DMA_AG_MAP, dma_mask,
+ of_dma_is_coherent(eth->dma_dev->of_node) * dma_mask);
+
if (MTK_HAS_CAPS(eth->soc->caps, MTK_SOC_MT7628)) {
ret = device_reset(eth->dev);
if (ret) {
@@ -3079,6 +3086,35 @@ free_netdev:
return err;
}

+void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev)
+{
+ struct net_device *dev, *tmp;
+ LIST_HEAD(dev_list);
+ int i;
+
+ rtnl_lock();
+
+ for (i = 0; i < MTK_MAC_COUNT; i++) {
+ dev = eth->netdev[i];
+
+ if (!dev || !(dev->flags & IFF_UP))
+ continue;
+
+ list_add_tail(&dev->close_list, &dev_list);
+ }
+
+ dev_close_many(&dev_list, false);
+
+ eth->dma_dev = dma_dev;
+
+ list_for_each_entry_safe(dev, tmp, &dev_list, close_list) {
+ list_del_init(&dev->close_list);
+ dev_open(dev, NULL);
+ }
+
+ rtnl_unlock();
+}
+
static int mtk_probe(struct platform_device *pdev)
{
struct device_node *mac_np;
@@ -3092,6 +3128,7 @@ static int mtk_probe(struct platform_dev
eth->soc = of_device_get_match_data(&pdev->dev);

eth->dev = &pdev->dev;
+ eth->dma_dev = &pdev->dev;
eth->base = devm_platform_ioremap_resource(pdev, 0);
if (IS_ERR(eth->base))
return PTR_ERR(eth->base);
@@ -3140,6 +3177,16 @@ static int mtk_probe(struct platform_dev
}
}

+ if (of_dma_is_coherent(pdev->dev.of_node)) {
+ struct regmap *cci;
+
+ cci = syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+ "mediatek,cci-control");
+ /* enable CPU/bus coherency */
+ if (!IS_ERR(cci))
+ regmap_write(cci, 0, 3);
+ }
+
if (MTK_HAS_CAPS(eth->soc->caps, MTK_SGMII)) {
eth->sgmii = devm_kzalloc(eth->dev, sizeof(*eth->sgmii),
GFP_KERNEL);
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
@@ -463,6 +463,12 @@
#define RSTCTRL_FE BIT(6)
#define RSTCTRL_PPE BIT(31)

+/* ethernet dma channel agent map */
+#define ETHSYS_DMA_AG_MAP 0x408
+#define ETHSYS_DMA_AG_MAP_PDMA BIT(0)
+#define ETHSYS_DMA_AG_MAP_QDMA BIT(1)
+#define ETHSYS_DMA_AG_MAP_PPE BIT(2)
+
/* SGMII subsystem config registers */
/* Register to auto-negotiation restart */
#define SGMSYS_PCS_CONTROL_1 0x0
@@ -880,6 +886,7 @@ struct mtk_sgmii {

/* struct mtk_eth - This is the main datasructure for holding the state
* of the driver
* @dev: The device pointer
+ * @dev: The device pointer used for dma mapping/alloc
* @base: The mapped register i/o base
* @page_lock: Make sure that register operations are atomic
* @tx_irq__lock: Make sure that IRQ register operations are atomic
@@ -923,6 +930,7 @@ struct mtk_sgmii {

struct mtk_eth {
struct device *dev;
+ struct device *dma_dev;
void __iomem *base;
spinlock_t page_lock;
spinlock_t tx_irq_lock;
@@ -1021,6 +1029,7 @@ int mtk_gmac_rgmii_path_setup(struct mtk
int mtk_eth_offload_init(struct mtk_eth *eth);
int mtk_eth_setup_tc(struct net_device *dev, enum tc_setup_type type,
void *type_data);
+void mtk_eth_set_dma_device(struct mtk_eth *eth, struct device *dma_dev);


#endif /* MTK_ETH_H */
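As a usage sketch (hypothetical caller, not part of this patch): a consumer such as the WED core can hand the driver a different DMA device at runtime. mtk_eth_set_dma_device() closes any running netdevs, swaps eth->dma_dev, and re-opens them, so all rings are freed and then re-allocated and re-mapped against the new device on the next open.

#include <linux/device.h>
#include "mtk_eth_soc.h"	/* struct mtk_eth, mtk_eth_set_dma_device() */

/* illustrative only; example_attach_wed()/example_detach_wed() are made up */
static void example_attach_wed(struct mtk_eth *eth, struct device *wed_dev)
{
	/* redo all DMA allocations/mappings against the WED device */
	mtk_eth_set_dma_device(eth, wed_dev);
}

static void example_detach_wed(struct mtk_eth *eth)
{
	/* fall back to the platform device for DMA */
	mtk_eth_set_dma_device(eth, eth->dev);
}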
@ -0,0 +1,30 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Mon, 7 Feb 2022 10:27:22 +0100
Subject: [PATCH] arm64: dts: mediatek: mt7622: add support for coherent
 DMA

It improves performance by eliminating the need for a cache flush on rx and tx.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

--- a/arch/arm64/boot/dts/mediatek/mt7622.dtsi
+++ b/arch/arm64/boot/dts/mediatek/mt7622.dtsi
@@ -357,7 +357,7 @@
};

cci_control2: slave-if@5000 {
- compatible = "arm,cci-400-ctrl-if";
+ compatible = "arm,cci-400-ctrl-if", "syscon";
interface-type = "ace";
reg = <0x5000 0x1000>;
};
@@ -938,6 +938,8 @@
power-domains = <&scpsys MT7622_POWER_DOMAIN_ETHSYS>;
mediatek,ethsys = <&ethsys>;
mediatek,sgmiisys = <&sgmiisys>;
+ mediatek,cci-control = <&cci_control2>;
+ dma-coherent;
#address-cells = <1>;
#size-cells = <0>;
status = "disabled";