2023-03-21 06:51:03 +09:00
|
|
|
From 36a18a68ea458e8f4db2ca86b00091daf32c6c74 Mon Sep 17 00:00:00 2001
|
|
|
|
From: Yu Zhao <yuzhao@google.com>
|
|
|
|
Date: Sun, 18 Sep 2022 02:00:06 -0600
|
|
|
|
Subject: [PATCH 09/29] mm: multi-gen LRU: optimize multiple memcgs
|
|
|
|
MIME-Version: 1.0
|
|
|
|
Content-Type: text/plain; charset=UTF-8
|
|
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
|
|
|
|
When multiple memcgs are available, it is possible to use generations as a
|
|
|
|
frame of reference to make better choices and improve overall performance
|
|
|
|
under global memory pressure. This patch adds a basic optimization to
|
|
|
|
select memcgs that can drop single-use unmapped clean pages first. Doing
|
|
|
|
so reduces the chance of going into the aging path or swapping, which can
|
|
|
|
be costly.
|
|
|
|
|
|
|
|
A typical example that benefits from this optimization is a server running
|
|
|
|
mixed types of workloads, e.g., heavy anon workload in one memcg and heavy
|
|
|
|
buffered I/O workload in the other.
|
|
|
|
|
|
|
|
Though this optimization can be applied to both kswapd and direct reclaim,
|
|
|
|
it is only added to kswapd to keep the patchset manageable. Later
|
|
|
|
improvements may cover the direct reclaim path.
|
|
|
|
|
|
|
|
While ensuring certain fairness to all eligible memcgs, proportional scans
|
|
|
|
of individual memcgs also require proper backoff to avoid overshooting
|
|
|
|
their aggregate reclaim target by too much. Otherwise it can cause high
|
|
|
|
direct reclaim latency. The conditions for backoff are:
|
|
|
|
|
|
|
|
1. At low priorities, for direct reclaim, if aging fairness or direct
|
|
|
|
reclaim latency is at risk, i.e., aging one memcg multiple times or
|
|
|
|
swapping after the target is met.
|
|
|
|
2. At high priorities, for global reclaim, if per-zone free pages are
|
|
|
|
above respective watermarks.
|
|
|
|
|
|
|
|
Server benchmark results:
|
|
|
|
Mixed workloads:
|
|
|
|
fio (buffered I/O): +[19, 21]%
|
|
|
|
IOPS BW
|
|
|
|
patch1-8: 1880k 7343MiB/s
|
|
|
|
patch1-9: 2252k 8796MiB/s
|
|
|
|
|
|
|
|
memcached (anon): +[119, 123]%
|
|
|
|
Ops/sec KB/sec
|
|
|
|
patch1-8: 862768.65 33514.68
|
|
|
|
patch1-9: 1911022.12 74234.54
|
|
|
|
|
|
|
|
Mixed workloads:
|
|
|
|
fio (buffered I/O): +[75, 77]%
|
|
|
|
IOPS BW
|
|
|
|
5.19-rc1: 1279k 4996MiB/s
|
|
|
|
patch1-9: 2252k 8796MiB/s
|
|
|
|
|
|
|
|
memcached (anon): +[13, 15]%
|
|
|
|
Ops/sec KB/sec
|
|
|
|
5.19-rc1: 1673524.04 65008.87
|
|
|
|
patch1-9: 1911022.12 74234.54
|
|
|
|
|
|
|
|
Configurations:
|
|
|
|
(changes since patch 6)
|
|
|
|
|
|
|
|
cat mixed.sh
|
|
|
|
modprobe brd rd_nr=2 rd_size=56623104
|
|
|
|
|
|
|
|
swapoff -a
|
|
|
|
mkswap /dev/ram0
|
|
|
|
swapon /dev/ram0
|
|
|
|
|
|
|
|
mkfs.ext4 /dev/ram1
|
|
|
|
mount -t ext4 /dev/ram1 /mnt
|
|
|
|
|
|
|
|
memtier_benchmark -S /var/run/memcached/memcached.sock \
|
|
|
|
-P memcache_binary -n allkeys --key-minimum=1 \
|
|
|
|
--key-maximum=50000000 --key-pattern=P:P -c 1 -t 36 \
|
|
|
|
--ratio 1:0 --pipeline 8 -d 2000
|
|
|
|
|
|
|
|
fio -name=mglru --numjobs=36 --directory=/mnt --size=1408m \
|
|
|
|
--buffered=1 --ioengine=io_uring --iodepth=128 \
|
|
|
|
--iodepth_batch_submit=32 --iodepth_batch_complete=32 \
|
|
|
|
--rw=randread --random_distribution=random --norandommap \
|
|
|
|
--time_based --ramp_time=10m --runtime=90m --group_reporting &
|
|
|
|
pid=$!
|
|
|
|
|
|
|
|
sleep 200
|
|
|
|
|
|
|
|
memtier_benchmark -S /var/run/memcached/memcached.sock \
|
|
|
|
-P memcache_binary -n allkeys --key-minimum=1 \
|
|
|
|
--key-maximum=50000000 --key-pattern=R:R -c 1 -t 36 \
|
|
|
|
--ratio 0:1 --pipeline 8 --randomize --distinct-client-seed
|
|
|
|
|
|
|
|
kill -INT $pid
|
|
|
|
wait
|
|
|
|
|
|
|
|
Client benchmark results:
|
|
|
|
no change (CONFIG_MEMCG=n)
|
|
|
|
|
|
|
|
Link: https://lkml.kernel.org/r/20220918080010.2920238-10-yuzhao@google.com
|
|
|
|
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
|
|
|
Acked-by: Brian Geffon <bgeffon@google.com>
|
|
|
|
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
|
|
|
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
|
|
|
Acked-by: Steven Barrett <steven@liquorix.net>
|
|
|
|
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
|
|
|
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
|
|
|
Tested-by: Donald Carr <d@chaos-reins.com>
|
|
|
|
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
|
|
|
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
|
|
|
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
|
|
|
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
|
|
|
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
|
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
|
|
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
|
|
|
Cc: Barry Song <baohua@kernel.org>
|
|
|
|
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
|
|
|
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
|
|
|
Cc: Hillf Danton <hdanton@sina.com>
|
|
|
|
Cc: Jens Axboe <axboe@kernel.dk>
|
|
|
|
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
|
|
|
Cc: Jonathan Corbet <corbet@lwn.net>
|
|
|
|
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
|
|
|
Cc: Matthew Wilcox <willy@infradead.org>
|
|
|
|
Cc: Mel Gorman <mgorman@suse.de>
|
|
|
|
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
|
|
|
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
|
|
|
Cc: Michal Hocko <mhocko@kernel.org>
|
|
|
|
Cc: Mike Rapoport <rppt@kernel.org>
|
|
|
|
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
|
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
|
|
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
|
|
|
Cc: Tejun Heo <tj@kernel.org>
|
|
|
|
Cc: Vlastimil Babka <vbabka@suse.cz>
|
|
|
|
Cc: Will Deacon <will@kernel.org>
|
|
|
|
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
|
|
|
---
|
|
|
|
mm/vmscan.c | 105 +++++++++++++++++++++++++++++++++++++++++++++++-----
|
|
|
|
1 file changed, 96 insertions(+), 9 deletions(-)
|
|
|
|
|
|
|
|
--- a/mm/vmscan.c
|
|
|
|
+++ b/mm/vmscan.c
|
|
|
|
@@ -127,6 +127,12 @@ struct scan_control {
|
|
|
|
/* Always discard instead of demoting to lower tier memory */
|
|
|
|
unsigned int no_demotion:1;
|
|
|
|
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
|
|
+ /* help kswapd make better choices among multiple memcgs */
|
|
|
|
+ unsigned int memcgs_need_aging:1;
|
|
|
|
+ unsigned long last_reclaimed;
|
|
|
|
+#endif
|
|
|
|
+
|
|
|
|
/* Allocation order */
|
|
|
|
s8 order;
|
|
|
|
|
2023-03-25 17:24:27 +01:00
|
|
|
@@ -4202,6 +4208,19 @@ static void lru_gen_age_node(struct pgli
|
2023-03-21 06:51:03 +09:00
|
|
|
|
|
|
|
VM_WARN_ON_ONCE(!current_is_kswapd());
|
|
|
|
|
|
|
|
+ sc->last_reclaimed = sc->nr_reclaimed;
|
|
|
|
+
|
|
|
|
+ /*
|
|
|
|
+ * To reduce the chance of going into the aging path, which can be
|
|
|
|
+ * costly, optimistically skip it if the flag below was cleared in the
|
|
|
|
+ * eviction path. This improves the overall performance when multiple
|
|
|
|
+ * memcgs are available.
|
|
|
|
+ */
|
|
|
|
+ if (!sc->memcgs_need_aging) {
|
|
|
|
+ sc->memcgs_need_aging = true;
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
set_mm_walk(pgdat);
|
|
|
|
|
|
|
|
memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
2023-03-25 17:24:27 +01:00
|
|
|
@@ -4613,7 +4632,8 @@ static int isolate_pages(struct lruvec *
|
2023-03-21 06:51:03 +09:00
|
|
|
return scanned;
|
|
|
|
}
|
|
|
|
|
|
|
|
-static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
|
|
|
|
+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
|
|
|
|
+ bool *need_swapping)
|
|
|
|
{
|
|
|
|
int type;
|
|
|
|
int scanned;
|
2023-03-25 17:24:27 +01:00
|
|
|
@@ -4676,6 +4696,9 @@ static int evict_pages(struct lruvec *lr
|
2023-03-21 06:51:03 +09:00
|
|
|
|
|
|
|
sc->nr_reclaimed += reclaimed;
|
|
|
|
|
|
|
|
+ if (need_swapping && type == LRU_GEN_ANON)
|
|
|
|
+ *need_swapping = true;
|
|
|
|
+
|
|
|
|
return scanned;
|
|
|
|
}
|
|
|
|
|
2023-03-25 17:24:27 +01:00
|
|
|
@@ -4685,9 +4708,8 @@ static int evict_pages(struct lruvec *lr
|
2023-03-21 06:51:03 +09:00
|
|
|
* reclaim.
|
|
|
|
*/
|
|
|
|
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
|
|
|
|
- bool can_swap)
|
|
|
|
+ bool can_swap, bool *need_aging)
|
|
|
|
{
|
|
|
|
- bool need_aging;
|
|
|
|
unsigned long nr_to_scan;
|
|
|
|
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
|
|
DEFINE_MAX_SEQ(lruvec);
|
2023-03-25 17:24:27 +01:00
|
|
|
@@ -4697,8 +4719,8 @@ static unsigned long get_nr_to_scan(stru
|
2023-03-21 06:51:03 +09:00
|
|
|
(mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
- need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
|
|
|
|
- if (!need_aging)
|
|
|
|
+ *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
|
|
|
|
+ if (!*need_aging)
|
|
|
|
return nr_to_scan;
|
|
|
|
|
|
|
|
/* skip the aging path at the default priority */
|
2023-03-25 17:24:27 +01:00
|
|
|
@@ -4715,10 +4737,68 @@ done:
|
2023-03-21 06:51:03 +09:00
|
|
|
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
+static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq,
|
|
|
|
+ struct scan_control *sc, bool need_swapping)
|
|
|
|
+{
|
|
|
|
+ int i;
|
|
|
|
+ DEFINE_MAX_SEQ(lruvec);
|
|
|
|
+
|
|
|
|
+ if (!current_is_kswapd()) {
|
|
|
|
+ /* age each memcg once to ensure fairness */
|
|
|
|
+ if (max_seq - seq > 1)
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ /* over-swapping can increase allocation latency */
|
|
|
|
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim && need_swapping)
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ /* give this thread a chance to exit and free its memory */
|
|
|
|
+ if (fatal_signal_pending(current)) {
|
|
|
|
+ sc->nr_reclaimed += MIN_LRU_BATCH;
|
|
|
|
+ return true;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (cgroup_reclaim(sc))
|
|
|
|
+ return false;
|
|
|
|
+ } else if (sc->nr_reclaimed - sc->last_reclaimed < sc->nr_to_reclaim)
|
|
|
|
+ return false;
|
|
|
|
+
|
|
|
|
+ /* keep scanning at low priorities to ensure fairness */
|
|
|
|
+ if (sc->priority > DEF_PRIORITY - 2)
|
|
|
|
+ return false;
|
|
|
|
+
|
|
|
|
+ /*
|
|
|
|
+ * A minimum amount of work was done under global memory pressure. For
|
|
|
|
+ * kswapd, it may be overshooting. For direct reclaim, the target isn't
|
|
|
|
+ * met, and yet the allocation may still succeed, since kswapd may have
|
|
|
|
+ * caught up. In either case, it's better to stop now, and restart if
|
|
|
|
+ * necessary.
|
|
|
|
+ */
|
|
|
|
+ for (i = 0; i <= sc->reclaim_idx; i++) {
|
|
|
|
+ unsigned long wmark;
|
|
|
|
+ struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
|
|
|
|
+
|
|
|
|
+ if (!managed_zone(zone))
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ wmark = current_is_kswapd() ? high_wmark_pages(zone) : low_wmark_pages(zone);
|
|
|
|
+ if (wmark > zone_page_state(zone, NR_FREE_PAGES))
|
|
|
|
+ return false;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ sc->nr_reclaimed += MIN_LRU_BATCH;
|
|
|
|
+
|
|
|
|
+ return true;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
|
|
|
{
|
|
|
|
struct blk_plug plug;
|
|
|
|
+ bool need_aging = false;
|
|
|
|
+ bool need_swapping = false;
|
|
|
|
unsigned long scanned = 0;
|
|
|
|
+ unsigned long reclaimed = sc->nr_reclaimed;
|
|
|
|
+ DEFINE_MAX_SEQ(lruvec);
|
|
|
|
|
|
|
|
lru_add_drain();
|
|
|
|
|
2023-03-25 17:24:27 +01:00
|
|
|
@@ -4738,21 +4818,28 @@ static void lru_gen_shrink_lruvec(struct
|
2023-03-21 06:51:03 +09:00
|
|
|
else
|
|
|
|
swappiness = 0;
|
|
|
|
|
|
|
|
- nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
|
|
|
|
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness, &need_aging);
|
|
|
|
if (!nr_to_scan)
|
|
|
|
- break;
|
|
|
|
+ goto done;
|
|
|
|
|
|
|
|
- delta = evict_pages(lruvec, sc, swappiness);
|
|
|
|
+ delta = evict_pages(lruvec, sc, swappiness, &need_swapping);
|
|
|
|
if (!delta)
|
|
|
|
- break;
|
|
|
|
+ goto done;
|
|
|
|
|
|
|
|
scanned += delta;
|
|
|
|
if (scanned >= nr_to_scan)
|
|
|
|
break;
|
|
|
|
|
|
|
|
+ if (should_abort_scan(lruvec, max_seq, sc, need_swapping))
|
|
|
|
+ break;
|
|
|
|
+
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
+ /* see the comment in lru_gen_age_node() */
|
|
|
|
+ if (sc->nr_reclaimed - reclaimed >= MIN_LRU_BATCH && !need_aging)
|
|
|
|
+ sc->memcgs_need_aging = false;
|
|
|
|
+done:
|
|
|
|
clear_mm_walk();
|
|
|
|
|
|
|
|
blk_finish_plug(&plug);
|