mirror of
https://github.com/openwrt/openwrt.git
synced 2025-01-15 17:30:28 +00:00
a64b262046
Some backported patches generated with git-format-patch were not refreshed. Use 'make target/linux/refresh' to align them with OpenWrt's patch style. Signed-off-by: Daniel Golle <daniel@makrotopia.org>
353 lines
13 KiB
Diff
353 lines
13 KiB
Diff
From 8c20e2eb5f2a0175b774134685e4d7bd93e85ff8 Mon Sep 17 00:00:00 2001
|
|
From: Yu Zhao <yuzhao@google.com>
|
|
Date: Wed, 21 Dec 2022 21:18:59 -0700
|
|
Subject: [PATCH 01/19] UPSTREAM: mm: multi-gen LRU: rename lru_gen_struct to
|
|
lru_gen_folio
|
|
|
|
Patch series "mm: multi-gen LRU: memcg LRU", v3.
|
|
|
|
Overview
|
|
========
|
|
|
|
An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs,
|
|
since each node and memcg combination has an LRU of folios (see
|
|
mem_cgroup_lruvec()).
|
|
|
|
Its goal is to improve the scalability of global reclaim, which is
|
|
critical to system-wide memory overcommit in data centers. Note that
|
|
memcg reclaim is currently out of scope.
|
|
|
|
Its memory overhead is one pointer per lruvec and is negligible for each
|
|
pglist_data. In terms of traversing memcgs during global reclaim, it
|
|
improves the best-case complexity from O(n) to O(1) and does not affect
|
|
the worst-case complexity O(n). Therefore, on average, it has a sublinear
|
|
complexity in contrast to the current linear complexity.
|
|
|
|
The basic structure of an memcg LRU can be understood by an analogy to
|
|
the active/inactive LRU (of folios):
|
|
1. It has the young and the old (generations), i.e., the counterparts
|
|
to the active and the inactive;
|
|
2. The increment of max_seq triggers promotion, i.e., the counterpart
|
|
to activation;
|
|
3. Other events trigger similar operations, e.g., offlining an memcg
|
|
triggers demotion, i.e., the counterpart to deactivation.
|
|
|
|
In terms of global reclaim, it has two distinct features:
|
|
1. Sharding, which allows each thread to start at a random memcg (in
|
|
the old generation) and improves parallelism;
|
|
2. Eventual fairness, which allows direct reclaim to bail out at will
|
|
and reduces latency without affecting fairness over some time.
|
|
|
|
The commit message in patch 6 details the workflow:
|
|
https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/
|
|
|
|
The following is a simple test to quickly verify its effectiveness.
|
|
|
|
Test design:
|
|
1. Create multiple memcgs.
|
|
2. Each memcg contains a job (fio).
|
|
3. All jobs access the same amount of memory randomly.
|
|
4. The system does not experience global memory pressure.
|
|
5. Periodically write to the root memory.reclaim.
|
|
|
|
Desired outcome:
|
|
1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal)
|
|
over mean(pgsteal) is close to 0%.
|
|
2. The total pgsteal is close to the total requested through
|
|
memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close
|
|
to 100%.
|
|
|
|
Actual outcome [1]:
|
|
MGLRU off MGLRU on
|
|
stddev(pgsteal) / mean(pgsteal) 75% 20%
|
|
sum(pgsteal) / sum(requested) 425% 95%
|
|
|
|
####################################################################
|
|
MEMCGS=128
|
|
|
|
for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
|
|
mkdir /sys/fs/cgroup/memcg$memcg
|
|
done
|
|
|
|
start() {
|
|
echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs
|
|
|
|
fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \
|
|
--filename=/dev/zero --size=1920M --rw=randrw \
|
|
--rate=64m,64m --random_distribution=random \
|
|
--fadvise_hint=0 --time_based --runtime=10h \
|
|
--group_reporting --minimal
|
|
}
|
|
|
|
for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
|
|
start &
|
|
done
|
|
|
|
sleep 600
|
|
|
|
for ((i = 0; i < 600; i++)); do
|
|
echo 256m >/sys/fs/cgroup/memory.reclaim
|
|
sleep 6
|
|
done
|
|
|
|
for ((memcg = 0; memcg < $MEMCGS; memcg++)); do
|
|
grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat
|
|
done
|
|
####################################################################
|
|
|
|
[1]: This was obtained by running the above script (which touches less
|
|
than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an
|
|
hour.
|
|
|
|
This patch (of 8):
|
|
|
|
The new name lru_gen_folio will be more distinct from the coming
|
|
lru_gen_memcg.
|
|
|
|
Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com
|
|
Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com
|
|
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
|
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
|
Cc: Jonathan Corbet <corbet@lwn.net>
|
|
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
|
Cc: Michal Hocko <mhocko@kernel.org>
|
|
Cc: Mike Rapoport <rppt@kernel.org>
|
|
Cc: Roman Gushchin <roman.gushchin@linux.dev>
|
|
Cc: Suren Baghdasaryan <surenb@google.com>
|
|
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
|
Bug: 274865848
|
|
(cherry picked from commit 391655fe08d1f942359a11148aa9aaf3f99d6d6f)
|
|
Change-Id: I7df67e0e2435ba28f10eaa57d28d98b61a9210a6
|
|
Signed-off-by: T.J. Mercier <tjmercier@google.com>
|
|
---
|
|
include/linux/mm_inline.h | 4 ++--
|
|
include/linux/mmzone.h | 6 +++---
|
|
mm/vmscan.c | 34 +++++++++++++++++-----------------
|
|
mm/workingset.c | 4 ++--
|
|
4 files changed, 24 insertions(+), 24 deletions(-)
|
|
|
|
--- a/include/linux/mm_inline.h
|
|
+++ b/include/linux/mm_inline.h
|
|
@@ -178,7 +178,7 @@ static inline void lru_gen_update_size(s
|
|
int zone = folio_zonenum(folio);
|
|
int delta = folio_nr_pages(folio);
|
|
enum lru_list lru = type * LRU_INACTIVE_FILE;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
|
|
VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
|
|
VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
|
|
@@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(str
|
|
int gen = folio_lru_gen(folio);
|
|
int type = folio_is_file_lru(folio);
|
|
int zone = folio_zonenum(folio);
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
|
|
VM_WARN_ON_ONCE_FOLIO(gen != -1, folio);
|
|
|
|
--- a/include/linux/mmzone.h
|
|
+++ b/include/linux/mmzone.h
|
|
@@ -404,7 +404,7 @@ enum {
|
|
* The number of pages in each generation is eventually consistent and therefore
|
|
* can be transiently negative when reset_batch_size() is pending.
|
|
*/
|
|
-struct lru_gen_struct {
|
|
+struct lru_gen_folio {
|
|
/* the aging increments the youngest generation number */
|
|
unsigned long max_seq;
|
|
/* the eviction increments the oldest generation numbers */
|
|
@@ -461,7 +461,7 @@ struct lru_gen_mm_state {
|
|
struct lru_gen_mm_walk {
|
|
/* the lruvec under reclaim */
|
|
struct lruvec *lruvec;
|
|
- /* unstable max_seq from lru_gen_struct */
|
|
+ /* unstable max_seq from lru_gen_folio */
|
|
unsigned long max_seq;
|
|
/* the next address within an mm to scan */
|
|
unsigned long next_addr;
|
|
@@ -524,7 +524,7 @@ struct lruvec {
|
|
unsigned long flags;
|
|
#ifdef CONFIG_LRU_GEN
|
|
/* evictable pages divided into generations */
|
|
- struct lru_gen_struct lrugen;
|
|
+ struct lru_gen_folio lrugen;
|
|
/* to concurrently iterate lru_gen_mm_list */
|
|
struct lru_gen_mm_state mm_state;
|
|
#endif
|
|
--- a/mm/vmscan.c
|
|
+++ b/mm/vmscan.c
|
|
@@ -3190,7 +3190,7 @@ static int get_nr_gens(struct lruvec *lr
|
|
|
|
static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
|
|
{
|
|
- /* see the comment on lru_gen_struct */
|
|
+ /* see the comment on lru_gen_folio */
|
|
return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
|
|
get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
|
|
get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
|
|
@@ -3596,7 +3596,7 @@ struct ctrl_pos {
|
|
static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
|
|
struct ctrl_pos *pos)
|
|
{
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
|
|
|
|
pos->refaulted = lrugen->avg_refaulted[type][tier] +
|
|
@@ -3611,7 +3611,7 @@ static void read_ctrl_pos(struct lruvec
|
|
static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
|
|
{
|
|
int hist, tier;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
|
|
unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
|
|
|
|
@@ -3688,7 +3688,7 @@ static int folio_update_gen(struct folio
|
|
static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
|
|
{
|
|
int type = folio_is_file_lru(folio);
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
|
|
unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
|
|
|
|
@@ -3733,7 +3733,7 @@ static void update_batch_size(struct lru
|
|
static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
|
|
{
|
|
int gen, type, zone;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
|
|
walk->batched = 0;
|
|
|
|
@@ -4250,7 +4250,7 @@ static bool inc_min_seq(struct lruvec *l
|
|
{
|
|
int zone;
|
|
int remaining = MAX_LRU_BATCH;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
|
|
|
|
if (type == LRU_GEN_ANON && !can_swap)
|
|
@@ -4286,7 +4286,7 @@ static bool try_to_inc_min_seq(struct lr
|
|
{
|
|
int gen, type, zone;
|
|
bool success = false;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
DEFINE_MIN_SEQ(lruvec);
|
|
|
|
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
|
|
@@ -4307,7 +4307,7 @@ next:
|
|
;
|
|
}
|
|
|
|
- /* see the comment on lru_gen_struct */
|
|
+ /* see the comment on lru_gen_folio */
|
|
if (can_swap) {
|
|
min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
|
|
min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
|
|
@@ -4329,7 +4329,7 @@ static void inc_max_seq(struct lruvec *l
|
|
{
|
|
int prev, next;
|
|
int type, zone;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
|
|
spin_lock_irq(&lruvec->lru_lock);
|
|
|
|
@@ -4387,7 +4387,7 @@ static bool try_to_inc_max_seq(struct lr
|
|
bool success;
|
|
struct lru_gen_mm_walk *walk;
|
|
struct mm_struct *mm = NULL;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
|
|
VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
|
|
|
|
@@ -4452,7 +4452,7 @@ static bool should_run_aging(struct lruv
|
|
unsigned long old = 0;
|
|
unsigned long young = 0;
|
|
unsigned long total = 0;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
|
|
for (type = !can_swap; type < ANON_AND_FILE; type++) {
|
|
@@ -4737,7 +4737,7 @@ static bool sort_folio(struct lruvec *lr
|
|
int delta = folio_nr_pages(folio);
|
|
int refs = folio_lru_refs(folio);
|
|
int tier = lru_tier_from_refs(refs);
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
|
|
VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
|
|
|
|
@@ -4837,7 +4837,7 @@ static int scan_folios(struct lruvec *lr
|
|
int scanned = 0;
|
|
int isolated = 0;
|
|
int remaining = MAX_LRU_BATCH;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
|
|
VM_WARN_ON_ONCE(!list_empty(list));
|
|
@@ -5237,7 +5237,7 @@ done:
|
|
|
|
static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
|
|
{
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
|
|
if (lrugen->enabled) {
|
|
enum lru_list lru;
|
|
@@ -5519,7 +5519,7 @@ static void lru_gen_seq_show_full(struct
|
|
int i;
|
|
int type, tier;
|
|
int hist = lru_hist_from_seq(seq);
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
|
|
for (tier = 0; tier < MAX_NR_TIERS; tier++) {
|
|
seq_printf(m, " %10d", tier);
|
|
@@ -5569,7 +5569,7 @@ static int lru_gen_seq_show(struct seq_f
|
|
unsigned long seq;
|
|
bool full = !debugfs_real_fops(m->file)->write;
|
|
struct lruvec *lruvec = v;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
int nid = lruvec_pgdat(lruvec)->node_id;
|
|
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
DEFINE_MAX_SEQ(lruvec);
|
|
@@ -5823,7 +5823,7 @@ void lru_gen_init_lruvec(struct lruvec *
|
|
{
|
|
int i;
|
|
int gen, type, zone;
|
|
- struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+ struct lru_gen_folio *lrugen = &lruvec->lrugen;
|
|
|
|
lrugen->max_seq = MIN_NR_GENS + 1;
|
|
lrugen->enabled = lru_gen_enabled();
|
|
--- a/mm/workingset.c
|
|
+++ b/mm/workingset.c
|
|
@@ -223,7 +223,7 @@ static void *lru_gen_eviction(struct fol
|
|
unsigned long token;
|
|
unsigned long min_seq;
|
|
struct lruvec *lruvec;
|
|
- struct lru_gen_struct *lrugen;
|
|
+ struct lru_gen_folio *lrugen;
|
|
int type = folio_is_file_lru(folio);
|
|
int delta = folio_nr_pages(folio);
|
|
int refs = folio_lru_refs(folio);
|
|
@@ -252,7 +252,7 @@ static void lru_gen_refault(struct folio
|
|
unsigned long token;
|
|
unsigned long min_seq;
|
|
struct lruvec *lruvec;
|
|
- struct lru_gen_struct *lrugen;
|
|
+ struct lru_gen_folio *lrugen;
|
|
struct mem_cgroup *memcg;
|
|
struct pglist_data *pgdat;
|
|
int type = folio_is_file_lru(folio);
|