mirror of
https://github.com/openwrt/openwrt.git
synced 2025-01-12 16:03:13 +00:00
387fde0da0
Removed because it is upstream: bcm53xx/patches-5.15/030-v5.16-0019-ARM-dts-BCM53573-Describe-on-SoC-BCM53125-rev-4-swit.patch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=cb1003c07e746e4e82bdd3959c9ea37018ed41a3 Removed because it is upstream: bcm53xx/patches-5.15/037-v6.6-0004-ARM-dts-BCM53573-Drop-nonexistent-default-off-LED-tr.patch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=c65a23e98e38dc991f495d6bdb3cfa6163a88a0c Removed because it is upstream: bcm53xx/patches-5.15/037-v6.6-0005-ARM-dts-BCM53573-Drop-nonexistent-usb-cells.patch https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=71475bcee001cae3844644c2787eef93b26489d1 Adapted hack-5.15/650-netfilter-add-xt_FLOWOFFLOAD-target.patch to match the changes from the upstream flow offload patch: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=7c71b831220edeab7ce603d818dc1708d9ea4137 Manually Adapted the following patch: bcm53xx/patches-5.15/035-v6.2-0004-ARM-dts-broadcom-align-LED-node-names-with-dtschema.patch Signed-off-by: Hauke Mehrtens <hauke@hauke-m.de>
808 lines
26 KiB
Diff
808 lines
26 KiB
Diff
From a9b328add8422921a0dbbef162730800e16e8cfd Mon Sep 17 00:00:00 2001
|
|
From: Yu Zhao <yuzhao@google.com>
|
|
Date: Sun, 18 Sep 2022 02:00:02 -0600
|
|
Subject: [PATCH 05/29] mm: multi-gen LRU: groundwork
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Evictable pages are divided into multiple generations for each lruvec.
|
|
The youngest generation number is stored in lrugen->max_seq for both
|
|
anon and file types as they are aged on an equal footing. The oldest
|
|
generation numbers are stored in lrugen->min_seq[] separately for anon
|
|
and file types as clean file pages can be evicted regardless of swap
|
|
constraints. These three variables are monotonically increasing.
|
|
|
|
Generation numbers are truncated into order_base_2(MAX_NR_GENS+1) bits
|
|
in order to fit into the gen counter in page->flags. Each truncated
|
|
generation number is an index to lrugen->lists[]. The sliding window
|
|
technique is used to track at least MIN_NR_GENS and at most
|
|
MAX_NR_GENS generations. The gen counter stores a value within [1,
|
|
MAX_NR_GENS] while a page is on one of lrugen->lists[]. Otherwise it
|
|
stores 0.
|
|
|
|
There are two conceptually independent procedures: "the aging", which
|
|
produces young generations, and "the eviction", which consumes old
|
|
generations. They form a closed-loop system, i.e., "the page reclaim".
|
|
Both procedures can be invoked from userspace for the purposes of working
|
|
set estimation and proactive reclaim. These techniques are commonly used
|
|
to optimize job scheduling (bin packing) in data centers [1][2].
|
|
|
|
To avoid confusion, the terms "hot" and "cold" will be applied to the
|
|
multi-gen LRU, as a new convention; the terms "active" and "inactive" will
|
|
be applied to the active/inactive LRU, as usual.
|
|
|
|
The protection of hot pages and the selection of cold pages are based
|
|
on page access channels and patterns. There are two access channels:
|
|
one through page tables and the other through file descriptors. The
|
|
protection of the former channel is by design stronger because:
|
|
1. The uncertainty in determining the access patterns of the former
|
|
channel is higher due to the approximation of the accessed bit.
|
|
2. The cost of evicting the former channel is higher due to the TLB
|
|
flushes required and the likelihood of encountering the dirty bit.
|
|
3. The penalty of underprotecting the former channel is higher because
|
|
applications usually do not prepare themselves for major page
|
|
faults like they do for blocked I/O. E.g., GUI applications
|
|
commonly use dedicated I/O threads to avoid blocking rendering
|
|
threads.
|
|
|
|
There are also two access patterns: one with temporal locality and the
|
|
other without. For the reasons listed above, the former channel is
|
|
assumed to follow the former pattern unless VM_SEQ_READ or VM_RAND_READ is
|
|
present; the latter channel is assumed to follow the latter pattern unless
|
|
outlying refaults have been observed [3][4].
|
|
|
|
The next patch will address the "outlying refaults". Three macros, i.e.,
|
|
LRU_REFS_WIDTH, LRU_REFS_PGOFF and LRU_REFS_MASK, used later are added in
|
|
this patch to make the entire patchset less diffy.
|
|
|
|
A page is added to the youngest generation on faulting. The aging needs
|
|
to check the accessed bit at least twice before handing this page over to
|
|
the eviction. The first check takes care of the accessed bit set on the
|
|
initial fault; the second check makes sure this page has not been used
|
|
since then. This protocol, AKA second chance, requires a minimum of two
|
|
generations, hence MIN_NR_GENS.
|
|
|
|
[1] https://dl.acm.org/doi/10.1145/3297858.3304053
|
|
[2] https://dl.acm.org/doi/10.1145/3503222.3507731
|
|
[3] https://lwn.net/Articles/495543/
|
|
[4] https://lwn.net/Articles/815342/
|
|
|
|
Link: https://lkml.kernel.org/r/20220918080010.2920238-6-yuzhao@google.com
|
|
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
|
Acked-by: Brian Geffon <bgeffon@google.com>
|
|
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
|
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
|
Acked-by: Steven Barrett <steven@liquorix.net>
|
|
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
|
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
|
Tested-by: Donald Carr <d@chaos-reins.com>
|
|
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
|
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
|
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
|
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
|
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
|
Cc: Barry Song <baohua@kernel.org>
|
|
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
|
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
|
Cc: Hillf Danton <hdanton@sina.com>
|
|
Cc: Jens Axboe <axboe@kernel.dk>
|
|
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
|
Cc: Jonathan Corbet <corbet@lwn.net>
|
|
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Cc: Matthew Wilcox <willy@infradead.org>
|
|
Cc: Mel Gorman <mgorman@suse.de>
|
|
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
|
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
|
Cc: Michal Hocko <mhocko@kernel.org>
|
|
Cc: Mike Rapoport <rppt@kernel.org>
|
|
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
|
Cc: Tejun Heo <tj@kernel.org>
|
|
Cc: Vlastimil Babka <vbabka@suse.cz>
|
|
Cc: Will Deacon <will@kernel.org>
|
|
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
|
---
|
|
fs/fuse/dev.c | 3 +-
|
|
include/linux/mm.h | 2 +
|
|
include/linux/mm_inline.h | 177 +++++++++++++++++++++++++++++-
|
|
include/linux/mmzone.h | 100 +++++++++++++++++
|
|
include/linux/page-flags-layout.h | 13 ++-
|
|
include/linux/page-flags.h | 4 +-
|
|
include/linux/sched.h | 4 +
|
|
kernel/bounds.c | 5 +
|
|
mm/Kconfig | 8 ++
|
|
mm/huge_memory.c | 3 +-
|
|
mm/memcontrol.c | 2 +
|
|
mm/memory.c | 25 +++++
|
|
mm/mm_init.c | 6 +-
|
|
mm/mmzone.c | 2 +
|
|
mm/swap.c | 10 +-
|
|
mm/vmscan.c | 75 +++++++++++++
|
|
16 files changed, 425 insertions(+), 14 deletions(-)
|
|
|
|
--- a/fs/fuse/dev.c
|
|
+++ b/fs/fuse/dev.c
|
|
@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
|
|
1 << PG_active |
|
|
1 << PG_workingset |
|
|
1 << PG_reclaim |
|
|
- 1 << PG_waiters))) {
|
|
+ 1 << PG_waiters |
|
|
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
|
|
dump_page(page, "fuse: trying to steal weird page");
|
|
return 1;
|
|
}
|
|
--- a/include/linux/mm.h
|
|
+++ b/include/linux/mm.h
|
|
@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
|
|
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
|
|
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
|
|
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
|
|
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
|
|
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
|
|
|
|
/*
|
|
* Define the bit shifts to access each section. For non-existent
|
|
--- a/include/linux/mm_inline.h
|
|
+++ b/include/linux/mm_inline.h
|
|
@@ -26,10 +26,13 @@ static inline int page_is_file_lru(struc
|
|
|
|
static __always_inline void __update_lru_size(struct lruvec *lruvec,
|
|
enum lru_list lru, enum zone_type zid,
|
|
- int nr_pages)
|
|
+ long nr_pages)
|
|
{
|
|
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
|
|
|
+ lockdep_assert_held(&lruvec->lru_lock);
|
|
+ WARN_ON_ONCE(nr_pages != (int)nr_pages);
|
|
+
|
|
__mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages);
|
|
__mod_zone_page_state(&pgdat->node_zones[zid],
|
|
NR_ZONE_LRU_BASE + lru, nr_pages);
|
|
@@ -86,11 +89,177 @@ static __always_inline enum lru_list pag
|
|
return lru;
|
|
}
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+
|
|
+static inline bool lru_gen_enabled(void)
|
|
+{
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static inline bool lru_gen_in_fault(void)
|
|
+{
|
|
+ return current->in_lru_fault;
|
|
+}
|
|
+
|
|
+static inline int lru_gen_from_seq(unsigned long seq)
|
|
+{
|
|
+ return seq % MAX_NR_GENS;
|
|
+}
|
|
+
|
|
+static inline int page_lru_gen(struct page *page)
|
|
+{
|
|
+ unsigned long flags = READ_ONCE(page->flags);
|
|
+
|
|
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
+}
|
|
+
|
|
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
|
|
+{
|
|
+ unsigned long max_seq = lruvec->lrugen.max_seq;
|
|
+
|
|
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
|
|
+
|
|
+ /* see the comment on MIN_NR_GENS */
|
|
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
|
|
+}
|
|
+
|
|
+static inline void lru_gen_update_size(struct lruvec *lruvec, struct page *page,
|
|
+ int old_gen, int new_gen)
|
|
+{
|
|
+ int type = page_is_file_lru(page);
|
|
+ int zone = page_zonenum(page);
|
|
+ int delta = thp_nr_pages(page);
|
|
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
|
|
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+
|
|
+ VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS);
|
|
+ VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS);
|
|
+ VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1);
|
|
+
|
|
+ if (old_gen >= 0)
|
|
+ WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone],
|
|
+ lrugen->nr_pages[old_gen][type][zone] - delta);
|
|
+ if (new_gen >= 0)
|
|
+ WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone],
|
|
+ lrugen->nr_pages[new_gen][type][zone] + delta);
|
|
+
|
|
+ /* addition */
|
|
+ if (old_gen < 0) {
|
|
+ if (lru_gen_is_active(lruvec, new_gen))
|
|
+ lru += LRU_ACTIVE;
|
|
+ __update_lru_size(lruvec, lru, zone, delta);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /* deletion */
|
|
+ if (new_gen < 0) {
|
|
+ if (lru_gen_is_active(lruvec, old_gen))
|
|
+ lru += LRU_ACTIVE;
|
|
+ __update_lru_size(lruvec, lru, zone, -delta);
|
|
+ return;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
|
+{
|
|
+ unsigned long seq;
|
|
+ unsigned long flags;
|
|
+ int gen = page_lru_gen(page);
|
|
+ int type = page_is_file_lru(page);
|
|
+ int zone = page_zonenum(page);
|
|
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+
|
|
+ VM_WARN_ON_ONCE_PAGE(gen != -1, page);
|
|
+
|
|
+ if (PageUnevictable(page))
|
|
+ return false;
|
|
+ /*
|
|
+ * There are three common cases for this page:
|
|
+ * 1. If it's hot, e.g., freshly faulted in or previously hot and
|
|
+ * migrated, add it to the youngest generation.
|
|
+ * 2. If it's cold but can't be evicted immediately, i.e., an anon page
|
|
+ * not in swapcache or a dirty page pending writeback, add it to the
|
|
+ * second oldest generation.
|
|
+ * 3. Everything else (clean, cold) is added to the oldest generation.
|
|
+ */
|
|
+ if (PageActive(page))
|
|
+ seq = lrugen->max_seq;
|
|
+ else if ((type == LRU_GEN_ANON && !PageSwapCache(page)) ||
|
|
+ (PageReclaim(page) &&
|
|
+ (PageDirty(page) || PageWriteback(page))))
|
|
+ seq = lrugen->min_seq[type] + 1;
|
|
+ else
|
|
+ seq = lrugen->min_seq[type];
|
|
+
|
|
+ gen = lru_gen_from_seq(seq);
|
|
+ flags = (gen + 1UL) << LRU_GEN_PGOFF;
|
|
+ /* see the comment on MIN_NR_GENS about PG_active */
|
|
+ set_mask_bits(&page->flags, LRU_GEN_MASK | BIT(PG_active), flags);
|
|
+
|
|
+ lru_gen_update_size(lruvec, page, -1, gen);
|
|
+ /* for rotate_reclaimable_page() */
|
|
+ if (reclaiming)
|
|
+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
|
|
+ else
|
|
+ list_add(&page->lru, &lrugen->lists[gen][type][zone]);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
|
+{
|
|
+ unsigned long flags;
|
|
+ int gen = page_lru_gen(page);
|
|
+
|
|
+ if (gen < 0)
|
|
+ return false;
|
|
+
|
|
+ VM_WARN_ON_ONCE_PAGE(PageActive(page), page);
|
|
+ VM_WARN_ON_ONCE_PAGE(PageUnevictable(page), page);
|
|
+
|
|
+ /* for migrate_page_states() */
|
|
+ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0;
|
|
+ flags = set_mask_bits(&page->flags, LRU_GEN_MASK, flags);
|
|
+ gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
+
|
|
+ lru_gen_update_size(lruvec, page, gen, -1);
|
|
+ list_del(&page->lru);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+#else /* !CONFIG_LRU_GEN */
|
|
+
|
|
+static inline bool lru_gen_enabled(void)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline bool lru_gen_in_fault(void)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline bool lru_gen_add_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline bool lru_gen_del_page(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
+
|
|
static __always_inline void add_page_to_lru_list(struct page *page,
|
|
struct lruvec *lruvec)
|
|
{
|
|
enum lru_list lru = page_lru(page);
|
|
|
|
+ if (lru_gen_add_page(lruvec, page, false))
|
|
+ return;
|
|
+
|
|
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
|
list_add(&page->lru, &lruvec->lists[lru]);
|
|
}
|
|
@@ -100,6 +269,9 @@ static __always_inline void add_page_to_
|
|
{
|
|
enum lru_list lru = page_lru(page);
|
|
|
|
+ if (lru_gen_add_page(lruvec, page, true))
|
|
+ return;
|
|
+
|
|
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
|
|
list_add_tail(&page->lru, &lruvec->lists[lru]);
|
|
}
|
|
@@ -107,6 +279,9 @@ static __always_inline void add_page_to_
|
|
static __always_inline void del_page_from_lru_list(struct page *page,
|
|
struct lruvec *lruvec)
|
|
{
|
|
+ if (lru_gen_del_page(lruvec, page, false))
|
|
+ return;
|
|
+
|
|
list_del(&page->lru);
|
|
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
|
|
-thp_nr_pages(page));
|
|
--- a/include/linux/mmzone.h
|
|
+++ b/include/linux/mmzone.h
|
|
@@ -294,6 +294,102 @@ enum lruvec_flags {
|
|
*/
|
|
};
|
|
|
|
+#endif /* !__GENERATING_BOUNDS_H */
|
|
+
|
|
+/*
|
|
+ * Evictable pages are divided into multiple generations. The youngest and the
|
|
+ * oldest generation numbers, max_seq and min_seq, are monotonically increasing.
|
|
+ * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An
|
|
+ * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the
|
|
+ * corresponding generation. The gen counter in page->flags stores gen+1 while
|
|
+ * a page is on one of lrugen->lists[]. Otherwise it stores 0.
|
|
+ *
|
|
+ * A page is added to the youngest generation on faulting. The aging needs to
|
|
+ * check the accessed bit at least twice before handing this page over to the
|
|
+ * eviction. The first check takes care of the accessed bit set on the initial
|
|
+ * fault; the second check makes sure this page hasn't been used since then.
|
|
+ * This process, AKA second chance, requires a minimum of two generations,
|
|
+ * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive
|
|
+ * LRU, e.g., /proc/vmstat, these two generations are considered active; the
|
|
+ * rest of the generations, if they exist, are considered inactive. See
|
|
+ * lru_gen_is_active().
|
|
+ *
|
|
+ * PG_active is always cleared while a page is on one of lrugen->lists[] so that
|
|
+ * the aging need not worry about it. And it's set again when a page
|
|
+ * considered active is isolated for non-reclaiming purposes, e.g., migration.
|
|
+ * See lru_gen_add_page() and lru_gen_del_page().
|
|
+ *
|
|
+ * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the
|
|
+ * number of categories of the active/inactive LRU when keeping track of
|
|
+ * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits
|
|
+ * in page->flags.
|
|
+ */
|
|
+#define MIN_NR_GENS 2U
|
|
+#define MAX_NR_GENS 4U
|
|
+
|
|
+#ifndef __GENERATING_BOUNDS_H
|
|
+
|
|
+struct lruvec;
|
|
+
|
|
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
|
|
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
|
|
+
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+
|
|
+enum {
|
|
+ LRU_GEN_ANON,
|
|
+ LRU_GEN_FILE,
|
|
+};
|
|
+
|
|
+/*
|
|
+ * The youngest generation number is stored in max_seq for both anon and file
|
|
+ * types as they are aged on an equal footing. The oldest generation numbers are
|
|
+ * stored in min_seq[] separately for anon and file types as clean file pages
|
|
+ * can be evicted regardless of swap constraints.
|
|
+ *
|
|
+ * Normally anon and file min_seq are in sync. But if swapping is constrained,
|
|
+ * e.g., out of swap space, file min_seq is allowed to advance and leave anon
|
|
+ * min_seq behind.
|
|
+ *
|
|
+ * The number of pages in each generation is eventually consistent and therefore
|
|
+ * can be transiently negative.
|
|
+ */
|
|
+struct lru_gen_struct {
|
|
+ /* the aging increments the youngest generation number */
|
|
+ unsigned long max_seq;
|
|
+ /* the eviction increments the oldest generation numbers */
|
|
+ unsigned long min_seq[ANON_AND_FILE];
|
|
+ /* the multi-gen LRU lists, lazily sorted on eviction */
|
|
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
|
+ /* the multi-gen LRU sizes, eventually consistent */
|
|
+ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
|
+};
|
|
+
|
|
+void lru_gen_init_lruvec(struct lruvec *lruvec);
|
|
+
|
|
+#ifdef CONFIG_MEMCG
|
|
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
|
|
+void lru_gen_exit_memcg(struct mem_cgroup *memcg);
|
|
+#endif
|
|
+
|
|
+#else /* !CONFIG_LRU_GEN */
|
|
+
|
|
+static inline void lru_gen_init_lruvec(struct lruvec *lruvec)
|
|
+{
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_MEMCG
|
|
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
|
+{
|
|
+}
|
|
+
|
|
+static inline void lru_gen_exit_memcg(struct mem_cgroup *memcg)
|
|
+{
|
|
+}
|
|
+#endif
|
|
+
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
+
|
|
struct lruvec {
|
|
struct list_head lists[NR_LRU_LISTS];
|
|
/* per lruvec lru_lock for memcg */
|
|
@@ -311,6 +407,10 @@ struct lruvec {
|
|
unsigned long refaults[ANON_AND_FILE];
|
|
/* Various lruvec state flags (enum lruvec_flags) */
|
|
unsigned long flags;
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+ /* evictable pages divided into generations */
|
|
+ struct lru_gen_struct lrugen;
|
|
+#endif
|
|
#ifdef CONFIG_MEMCG
|
|
struct pglist_data *pgdat;
|
|
#endif
|
|
--- a/include/linux/page-flags-layout.h
|
|
+++ b/include/linux/page-flags-layout.h
|
|
@@ -55,7 +55,8 @@
|
|
#define SECTIONS_WIDTH 0
|
|
#endif
|
|
|
|
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
|
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
|
|
+ <= BITS_PER_LONG - NR_PAGEFLAGS
|
|
#define NODES_WIDTH NODES_SHIFT
|
|
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
|
|
#error "Vmemmap: No space for nodes field in page flags"
|
|
@@ -89,8 +90,8 @@
|
|
#define LAST_CPUPID_SHIFT 0
|
|
#endif
|
|
|
|
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
|
|
- <= BITS_PER_LONG - NR_PAGEFLAGS
|
|
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
|
|
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
|
|
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
|
|
#else
|
|
#define LAST_CPUPID_WIDTH 0
|
|
@@ -100,10 +101,12 @@
|
|
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
|
|
#endif
|
|
|
|
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
|
|
- > BITS_PER_LONG - NR_PAGEFLAGS
|
|
+#if ZONES_WIDTH + LRU_GEN_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
|
|
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
|
|
#error "Not enough bits in page flags"
|
|
#endif
|
|
|
|
+#define LRU_REFS_WIDTH 0
|
|
+
|
|
#endif
|
|
#endif /* _LINUX_PAGE_FLAGS_LAYOUT */
|
|
--- a/include/linux/page-flags.h
|
|
+++ b/include/linux/page-flags.h
|
|
@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
|
|
1UL << PG_private | 1UL << PG_private_2 | \
|
|
1UL << PG_writeback | 1UL << PG_reserved | \
|
|
1UL << PG_slab | 1UL << PG_active | \
|
|
- 1UL << PG_unevictable | __PG_MLOCKED)
|
|
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
|
|
|
|
/*
|
|
* Flags checked when a page is prepped for return by the page allocator.
|
|
@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
|
|
* alloc-free cycle to prevent from reusing the page.
|
|
*/
|
|
#define PAGE_FLAGS_CHECK_AT_PREP \
|
|
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
|
|
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
|
|
|
|
#define PAGE_FLAGS_PRIVATE \
|
|
(1UL << PG_private | 1UL << PG_private_2)
|
|
--- a/include/linux/sched.h
|
|
+++ b/include/linux/sched.h
|
|
@@ -907,6 +907,10 @@ struct task_struct {
|
|
#ifdef CONFIG_MEMCG
|
|
unsigned in_user_fault:1;
|
|
#endif
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+ /* whether the LRU algorithm may apply to this access */
|
|
+ unsigned in_lru_fault:1;
|
|
+#endif
|
|
#ifdef CONFIG_COMPAT_BRK
|
|
unsigned brk_randomized:1;
|
|
#endif
|
|
--- a/kernel/bounds.c
|
|
+++ b/kernel/bounds.c
|
|
@@ -22,6 +22,11 @@ int main(void)
|
|
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
|
|
#endif
|
|
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+ DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1));
|
|
+#else
|
|
+ DEFINE(LRU_GEN_WIDTH, 0);
|
|
+#endif
|
|
/* End of constants */
|
|
|
|
return 0;
|
|
--- a/mm/Kconfig
|
|
+++ b/mm/Kconfig
|
|
@@ -897,6 +897,14 @@ config IO_MAPPING
|
|
config SECRETMEM
|
|
def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
|
|
|
|
+config LRU_GEN
|
|
+ bool "Multi-Gen LRU"
|
|
+ depends on MMU
|
|
+ # make sure page->flags has enough spare bits
|
|
+ depends on 64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP
|
|
+ help
|
|
+ A high performance LRU implementation to overcommit memory.
|
|
+
|
|
source "mm/damon/Kconfig"
|
|
|
|
endmenu
|
|
--- a/mm/huge_memory.c
|
|
+++ b/mm/huge_memory.c
|
|
@@ -2366,7 +2366,8 @@ static void __split_huge_page_tail(struc
|
|
#ifdef CONFIG_64BIT
|
|
(1L << PG_arch_2) |
|
|
#endif
|
|
- (1L << PG_dirty)));
|
|
+ (1L << PG_dirty) |
|
|
+ LRU_GEN_MASK | LRU_REFS_MASK));
|
|
|
|
/* ->mapping in first tail page is compound_mapcount */
|
|
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
|
|
--- a/mm/memcontrol.c
|
|
+++ b/mm/memcontrol.c
|
|
@@ -5179,6 +5179,7 @@ static void __mem_cgroup_free(struct mem
|
|
|
|
static void mem_cgroup_free(struct mem_cgroup *memcg)
|
|
{
|
|
+ lru_gen_exit_memcg(memcg);
|
|
memcg_wb_domain_exit(memcg);
|
|
__mem_cgroup_free(memcg);
|
|
}
|
|
@@ -5242,6 +5243,7 @@ static struct mem_cgroup *mem_cgroup_all
|
|
memcg->deferred_split_queue.split_queue_len = 0;
|
|
#endif
|
|
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
|
|
+ lru_gen_init_memcg(memcg);
|
|
return memcg;
|
|
fail:
|
|
mem_cgroup_id_remove(memcg);
|
|
--- a/mm/memory.c
|
|
+++ b/mm/memory.c
|
|
@@ -4805,6 +4805,27 @@ static inline void mm_account_fault(stru
|
|
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
|
|
}
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
|
|
+{
|
|
+ /* the LRU algorithm doesn't apply to sequential or random reads */
|
|
+ current->in_lru_fault = !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ));
|
|
+}
|
|
+
|
|
+static void lru_gen_exit_fault(void)
|
|
+{
|
|
+ current->in_lru_fault = false;
|
|
+}
|
|
+#else
|
|
+static void lru_gen_enter_fault(struct vm_area_struct *vma)
|
|
+{
|
|
+}
|
|
+
|
|
+static void lru_gen_exit_fault(void)
|
|
+{
|
|
+}
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
+
|
|
/*
|
|
* By the time we get here, we already hold the mm semaphore
|
|
*
|
|
@@ -4836,11 +4857,15 @@ vm_fault_t handle_mm_fault(struct vm_are
|
|
if (flags & FAULT_FLAG_USER)
|
|
mem_cgroup_enter_user_fault();
|
|
|
|
+ lru_gen_enter_fault(vma);
|
|
+
|
|
if (unlikely(is_vm_hugetlb_page(vma)))
|
|
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
|
|
else
|
|
ret = __handle_mm_fault(vma, address, flags);
|
|
|
|
+ lru_gen_exit_fault();
|
|
+
|
|
if (flags & FAULT_FLAG_USER) {
|
|
mem_cgroup_exit_user_fault();
|
|
/*
|
|
--- a/mm/mm_init.c
|
|
+++ b/mm/mm_init.c
|
|
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
|
|
|
|
shift = 8 * sizeof(unsigned long);
|
|
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
|
|
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
|
|
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
|
|
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
|
|
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
|
|
SECTIONS_WIDTH,
|
|
NODES_WIDTH,
|
|
ZONES_WIDTH,
|
|
LAST_CPUPID_WIDTH,
|
|
KASAN_TAG_WIDTH,
|
|
+ LRU_GEN_WIDTH,
|
|
+ LRU_REFS_WIDTH,
|
|
NR_PAGEFLAGS);
|
|
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
|
|
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
|
|
--- a/mm/mmzone.c
|
|
+++ b/mm/mmzone.c
|
|
@@ -81,6 +81,8 @@ void lruvec_init(struct lruvec *lruvec)
|
|
|
|
for_each_lru(lru)
|
|
INIT_LIST_HEAD(&lruvec->lists[lru]);
|
|
+
|
|
+ lru_gen_init_lruvec(lruvec);
|
|
}
|
|
|
|
#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
|
|
--- a/mm/swap.c
|
|
+++ b/mm/swap.c
|
|
@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
|
|
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
|
|
VM_BUG_ON_PAGE(PageLRU(page), page);
|
|
|
|
+ /* see the comment in lru_gen_add_page() */
|
|
+ if (lru_gen_enabled() && !PageUnevictable(page) &&
|
|
+ lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
|
|
+ SetPageActive(page);
|
|
+
|
|
get_page(page);
|
|
local_lock(&lru_pvecs.lock);
|
|
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
|
|
@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
|
|
|
|
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
|
|
{
|
|
- if (PageActive(page) && !PageUnevictable(page)) {
|
|
+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
|
|
int nr_pages = thp_nr_pages(page);
|
|
|
|
del_page_from_lru_list(page, lruvec);
|
|
@@ -661,7 +666,8 @@ void deactivate_file_page(struct page *p
|
|
*/
|
|
void deactivate_page(struct page *page)
|
|
{
|
|
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
|
|
+ if (PageLRU(page) && !PageUnevictable(page) &&
|
|
+ (PageActive(page) || lru_gen_enabled())) {
|
|
struct pagevec *pvec;
|
|
|
|
local_lock(&lru_pvecs.lock);
|
|
--- a/mm/vmscan.c
|
|
+++ b/mm/vmscan.c
|
|
@@ -2821,6 +2821,81 @@ static bool can_age_anon_pages(struct pg
|
|
return can_demote(pgdat->node_id, sc);
|
|
}
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+
|
|
+/******************************************************************************
|
|
+ * shorthand helpers
|
|
+ ******************************************************************************/
|
|
+
|
|
+#define for_each_gen_type_zone(gen, type, zone) \
|
|
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
|
|
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
|
|
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
|
|
+
|
|
+static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
|
|
+{
|
|
+ struct pglist_data *pgdat = NODE_DATA(nid);
|
|
+
|
|
+#ifdef CONFIG_MEMCG
|
|
+ if (memcg) {
|
|
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
|
|
+
|
|
+ /* for hotadd_new_pgdat() */
|
|
+ if (!lruvec->pgdat)
|
|
+ lruvec->pgdat = pgdat;
|
|
+
|
|
+ return lruvec;
|
|
+ }
|
|
+#endif
|
|
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
|
|
+
|
|
+ return pgdat ? &pgdat->__lruvec : NULL;
|
|
+}
|
|
+
|
|
+/******************************************************************************
|
|
+ * initialization
|
|
+ ******************************************************************************/
|
|
+
|
|
+void lru_gen_init_lruvec(struct lruvec *lruvec)
|
|
+{
|
|
+ int gen, type, zone;
|
|
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
+
|
|
+ lrugen->max_seq = MIN_NR_GENS + 1;
|
|
+
|
|
+ for_each_gen_type_zone(gen, type, zone)
|
|
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_MEMCG
|
|
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
|
+{
|
|
+}
|
|
+
|
|
+void lru_gen_exit_memcg(struct mem_cgroup *memcg)
|
|
+{
|
|
+ int nid;
|
|
+
|
|
+ for_each_node(nid) {
|
|
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
|
+
|
|
+ VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
|
|
+ sizeof(lruvec->lrugen.nr_pages)));
|
|
+ }
|
|
+}
|
|
+#endif
|
|
+
|
|
+static int __init init_lru_gen(void)
|
|
+{
|
|
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
|
|
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
|
|
+
|
|
+ return 0;
|
|
+};
|
|
+late_initcall(init_lru_gen);
|
|
+
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
+
|
|
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
|
{
|
|
unsigned long nr[NR_LRU_LISTS];
|