mirror of
https://github.com/openwrt/openwrt.git
synced 2024-12-20 22:23:27 +00:00
05158082f6
Backport a preliminary version of Yu Zhao's multi-generational LRU, for improved memory management. Refresh the patches while at it. Signed-off-by: Rui Salvaterra <rsalvaterra@gmail.com>
1003 lines
30 KiB
Diff
1003 lines
30 KiB
Diff
From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001
|
|
From: Yu Zhao <yuzhao@google.com>
|
|
Date: Tue, 29 Jun 2021 20:46:47 -0600
|
|
Subject: [PATCH 07/10] mm: multigenerational lru: eviction
|
|
|
|
The eviction consumes old generations. Given an lruvec, the eviction
|
|
scans pages on lrugen->lists indexed by anon and file min_seq[]
|
|
(modulo MAX_NR_GENS). It first tries to select a type based on the
|
|
values of min_seq[]. If they are equal, it selects the type that has
|
|
a lower refaulted %. The eviction sorts a page according to its
|
|
updated generation number if the aging has found this page accessed.
|
|
It also moves a page to the next generation if this page is from an
|
|
upper tier that has a higher refaulted % than the base tier. The
|
|
eviction increments min_seq[] of a selected type when it finds
|
|
lrugen->lists indexed by min_seq[] of this selected type are empty.
|
|
|
|
Each generation is divided into multiple tiers. Tiers represent
|
|
different ranges of numbers of accesses from file descriptors only.
|
|
Pages accessed N times via file descriptors belong to tier
|
|
order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers,
|
|
and they require additional MAX_NR_TIERS-2 bits in page->flags. In
|
|
contrast to moving between generations which requires list operations,
|
|
moving between tiers only involves operations on page->flags and
|
|
therefore has a negligible cost. A feedback loop modeled after the PID
|
|
controller monitors refaulted % across all tiers and decides when to
|
|
protect pages from which tiers.
|
|
|
|
Unmapped pages are initially added to the oldest generation and then
|
|
conditionally protected by tiers. Each tier keeps track of how many
|
|
pages from it have refaulted. Tier 0 is the base tier and pages from
|
|
it are evicted unconditionally because there are no better candidates.
|
|
Pages from an upper tier are either evicted or moved to the next
|
|
generation, depending on whether this upper tier has a higher
|
|
refaulted % than the base tier. This model has the following
|
|
advantages:
|
|
1) It removes the cost in the buffered access path and reduces the
|
|
overall cost of protection because pages are conditionally protected
|
|
in the reclaim path.
|
|
2) It takes mapped pages into account and avoids overprotecting
|
|
pages accessed multiple times via file descriptors.
|
|
3) Additional tiers improve the protection of pages accessed more
|
|
than twice.
|
|
|
|
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
|
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
|
Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac
|
|
---
|
|
include/linux/mm_inline.h | 10 +
|
|
include/linux/mmzone.h | 33 +++
|
|
mm/swap.c | 42 +++
|
|
mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++-
|
|
mm/workingset.c | 120 ++++++++-
|
|
5 files changed, 757 insertions(+), 3 deletions(-)
|
|
|
|
--- a/include/linux/mm_inline.h
|
|
+++ b/include/linux/mm_inline.h
|
|
@@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi
|
|
return seq % NR_HIST_GENS;
|
|
}
|
|
|
|
+/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */
|
|
+static inline int lru_tier_from_refs(int refs)
|
|
+{
|
|
+ VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH));
|
|
+
|
|
+ return order_base_2(refs + 1);
|
|
+}
|
|
+
|
|
/* The youngest and the second youngest generations are counted as active. */
|
|
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
|
|
{
|
|
@@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru
|
|
gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
|
|
new_flags &= ~LRU_GEN_MASK;
|
|
+ if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS)
|
|
+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
|
|
/* for shrink_page_list() */
|
|
if (reclaiming)
|
|
new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
|
|
--- a/include/linux/mmzone.h
|
|
+++ b/include/linux/mmzone.h
|
|
@@ -319,6 +319,30 @@ struct page_vma_mapped_walk;
|
|
#define MIN_NR_GENS 2
|
|
#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
|
|
|
|
+/*
|
|
+ * Each generation is divided into multiple tiers. Tiers represent different
|
|
+ * ranges of numbers of accesses from file descriptors, i.e.,
|
|
+ * mark_page_accessed(). In contrast to moving between generations which
|
|
+ * requires the lru lock, moving between tiers only involves an atomic
|
|
+ * operation on page->flags and therefore has a negligible cost.
|
|
+ *
|
|
+ * The purposes of tiers are to:
|
|
+ * 1) estimate whether pages accessed multiple times via file descriptors are
|
|
+ * more active than pages accessed only via page tables by separating the two
|
|
+ * access types into upper tiers and the base tier, and comparing refaulted %
|
|
+ * across all tiers.
|
|
+ * 2) improve buffered io performance by deferring the protection of pages
|
|
+ * accessed multiple times until the eviction. That is, the protection happens
|
|
+ * in the reclaim path, not the access path.
|
|
+ *
|
|
+ * Pages accessed N times via file descriptors belong to tier order_base_2(N).
|
|
+ * The base tier may be marked by PageReferenced(). All upper tiers are marked
|
|
+ * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are
|
|
+ * used to support more than one upper tier.
|
|
+ */
|
|
+#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN)
|
|
+#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
|
|
+
|
|
/* Whether to keep stats for historical generations. */
|
|
#ifdef CONFIG_LRU_GEN_STATS
|
|
#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
|
|
@@ -337,6 +361,15 @@ struct lrugen {
|
|
struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
|
/* the sizes of the multigenerational lru lists in pages */
|
|
unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
|
+ /* the exponential moving average of refaulted */
|
|
+ unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS];
|
|
+ /* the exponential moving average of protected+evicted */
|
|
+ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS];
|
|
+ /* the base tier isn't protected, hence the minus one */
|
|
+ unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1];
|
|
+ /* incremented without holding the lru lock */
|
|
+ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
|
+ atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
|
/* whether the multigenerational lru is enabled */
|
|
bool enabled[ANON_AND_FILE];
|
|
};
|
|
--- a/mm/swap.c
|
|
+++ b/mm/swap.c
|
|
@@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st
|
|
local_unlock(&lru_pvecs.lock);
|
|
}
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+static void page_inc_refs(struct page *page)
|
|
+{
|
|
+ unsigned long refs;
|
|
+ unsigned long old_flags, new_flags;
|
|
+
|
|
+ if (PageUnevictable(page))
|
|
+ return;
|
|
+
|
|
+ /* see the comment on MAX_NR_TIERS */
|
|
+ do {
|
|
+ new_flags = old_flags = READ_ONCE(page->flags);
|
|
+
|
|
+ if (!(new_flags & BIT(PG_referenced))) {
|
|
+ new_flags |= BIT(PG_referenced);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (!(new_flags & BIT(PG_workingset))) {
|
|
+ new_flags |= BIT(PG_workingset);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ refs = new_flags & LRU_REFS_MASK;
|
|
+ refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK);
|
|
+
|
|
+ new_flags &= ~LRU_REFS_MASK;
|
|
+ new_flags |= refs;
|
|
+ } while (new_flags != old_flags &&
|
|
+ cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
|
|
+}
|
|
+#else
|
|
+static void page_inc_refs(struct page *page)
|
|
+{
|
|
+}
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
+
|
|
/*
|
|
* Mark a page as having seen activity.
|
|
*
|
|
@@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag
|
|
{
|
|
page = compound_head(page);
|
|
|
|
+ if (lru_gen_enabled()) {
|
|
+ page_inc_refs(page);
|
|
+ return;
|
|
+ }
|
|
+
|
|
if (!PageReferenced(page)) {
|
|
SetPageReferenced(page);
|
|
} else if (PageUnevictable(page)) {
|
|
--- a/mm/vmscan.c
|
|
+++ b/mm/vmscan.c
|
|
@@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre
|
|
|
|
if (PageSwapCache(page)) {
|
|
swp_entry_t swap = { .val = page_private(page) };
|
|
- mem_cgroup_swapout(page, swap);
|
|
+
|
|
+ /* get a shadow entry before page_memcg() is cleared */
|
|
if (reclaimed && !mapping_exiting(mapping))
|
|
shadow = workingset_eviction(page, target_memcg);
|
|
+ mem_cgroup_swapout(page, swap);
|
|
__delete_from_swap_cache(page, swap, shadow);
|
|
xa_unlock_irq(&mapping->i_pages);
|
|
put_swap_page(page, swap);
|
|
@@ -1410,6 +1412,11 @@ retry:
|
|
if (!sc->may_unmap && page_mapped(page))
|
|
goto keep_locked;
|
|
|
|
+ /* lru_gen_look_around() has updated this page? */
|
|
+ if (lru_gen_enabled() && !ignore_references &&
|
|
+ page_mapped(page) && PageReferenced(page))
|
|
+ goto keep_locked;
|
|
+
|
|
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
|
|
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
|
|
|
|
@@ -2570,6 +2577,9 @@ static void prepare_scan_count(pg_data_t
|
|
unsigned long file;
|
|
struct lruvec *target_lruvec;
|
|
|
|
+ if (lru_gen_enabled())
|
|
+ return;
|
|
+
|
|
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
|
|
|
|
/*
|
|
@@ -2910,6 +2920,17 @@ static int page_lru_gen(struct page *pag
|
|
return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
}
|
|
|
|
+static int page_lru_tier(struct page *page)
|
|
+{
|
|
+ int refs;
|
|
+ unsigned long flags = READ_ONCE(page->flags);
|
|
+
|
|
+ refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ?
|
|
+ ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0;
|
|
+
|
|
+ return lru_tier_from_refs(refs);
|
|
+}
|
|
+
|
|
static int get_swappiness(struct mem_cgroup *memcg)
|
|
{
|
|
return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ?
|
|
@@ -3246,6 +3267,91 @@ done:
|
|
}
|
|
|
|
/******************************************************************************
|
|
+ * refault feedback loop
|
|
+ ******************************************************************************/
|
|
+
|
|
+/*
|
|
+ * A feedback loop modeled after the PID controller. Currently supports the
|
|
+ * proportional (P) and the integral (I) terms; the derivative (D) term can be
|
|
+ * added if necessary. The setpoint (SP) is the desired position; the process
|
|
+ * variable (PV) is the measured position. The error is the difference between
|
|
+ * the SP and the PV. A positive error results in a positive control output
|
|
+ * correction, which, in our case, is to allow eviction.
|
|
+ *
|
|
+ * The P term is refaulted % of the current generation being evicted. The I
|
|
+ * term is the exponential moving average of refaulted % of previously evicted
|
|
+ * generations, using the smoothing factor 1/2.
|
|
+ *
|
|
+ * Our goal is to maintain proportional refaulted % across all tiers.
|
|
+ */
|
|
+struct ctrl_pos {
|
|
+ unsigned long refaulted;
|
|
+ unsigned long total;
|
|
+ int gain;
|
|
+};
|
|
+
|
|
+static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
|
|
+ struct ctrl_pos *pos)
|
|
+{
|
|
+ struct lrugen *lrugen = &lruvec->evictable;
|
|
+ int hist = lru_hist_from_seq(lrugen->min_seq[type]);
|
|
+
|
|
+ pos->refaulted = lrugen->avg_refaulted[type][tier] +
|
|
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
|
|
+ pos->total = lrugen->avg_total[type][tier] +
|
|
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
|
|
+ if (tier)
|
|
+ pos->total += lrugen->protected[hist][type][tier - 1];
|
|
+ pos->gain = gain;
|
|
+}
|
|
+
|
|
+static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type)
|
|
+{
|
|
+ int tier;
|
|
+ int hist = lru_hist_from_seq(gen);
|
|
+ struct lrugen *lrugen = &lruvec->evictable;
|
|
+ bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]);
|
|
+ bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
|
|
+
|
|
+ if (!carryover && !clear)
|
|
+ return;
|
|
+
|
|
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
|
|
+ if (carryover) {
|
|
+ unsigned long sum;
|
|
+
|
|
+ sum = lrugen->avg_refaulted[type][tier] +
|
|
+ atomic_long_read(&lrugen->refaulted[hist][type][tier]);
|
|
+ WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
|
|
+
|
|
+ sum = lrugen->avg_total[type][tier] +
|
|
+ atomic_long_read(&lrugen->evicted[hist][type][tier]);
|
|
+ if (tier)
|
|
+ sum += lrugen->protected[hist][type][tier - 1];
|
|
+ WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
|
|
+ }
|
|
+
|
|
+ if (clear) {
|
|
+ atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
|
|
+ atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
|
|
+ if (tier)
|
|
+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
|
|
+{
|
|
+ /*
|
|
+ * Allow eviction if the PV has a limited number of refaulted pages or a
|
|
+ * lower refaulted % than the SP.
|
|
+ */
|
|
+ return pv->refaulted < MIN_BATCH_SIZE ||
|
|
+ pv->refaulted * max(sp->total, 1UL) * sp->gain <=
|
|
+ sp->refaulted * max(pv->total, 1UL) * pv->gain;
|
|
+}
|
|
+
|
|
+/******************************************************************************
|
|
* the aging
|
|
******************************************************************************/
|
|
|
|
@@ -3265,6 +3371,7 @@ static int page_update_gen(struct page *
|
|
|
|
new_flags &= ~LRU_GEN_MASK;
|
|
new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
|
|
+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
|
|
} while (new_flags != old_flags &&
|
|
cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
|
|
|
|
@@ -3296,6 +3403,7 @@ static void page_inc_gen(struct page *pa
|
|
|
|
new_flags &= ~LRU_GEN_MASK;
|
|
new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
|
|
+ new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS);
|
|
/* for end_page_writeback() */
|
|
if (reclaiming)
|
|
new_flags |= BIT(PG_reclaim);
|
|
@@ -3787,6 +3895,7 @@ static bool inc_min_seq(struct lruvec *l
|
|
}
|
|
}
|
|
|
|
+ reset_ctrl_pos(lruvec, gen, type);
|
|
WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
|
|
|
|
return true;
|
|
@@ -3824,6 +3933,8 @@ next:
|
|
if (min_seq[type] == lrugen->min_seq[type])
|
|
continue;
|
|
|
|
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
|
|
+ reset_ctrl_pos(lruvec, gen, type);
|
|
WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
|
|
success = true;
|
|
}
|
|
@@ -3885,6 +3996,9 @@ static void inc_max_seq(struct lruvec *l
|
|
}
|
|
}
|
|
|
|
+ for (type = 0; type < ANON_AND_FILE; type++)
|
|
+ reset_ctrl_pos(lruvec, gen, type);
|
|
+
|
|
WRITE_ONCE(lrugen->timestamps[gen], jiffies);
|
|
/* make sure all preceding modifications appear first */
|
|
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
|
|
@@ -4166,6 +4280,433 @@ void lru_gen_look_around(struct page_vma
|
|
}
|
|
|
|
/******************************************************************************
|
|
+ * the eviction
|
|
+ ******************************************************************************/
|
|
+
|
|
+static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx)
|
|
+{
|
|
+ bool success;
|
|
+ int gen = page_lru_gen(page);
|
|
+ int type = page_is_file_lru(page);
|
|
+ int zone = page_zonenum(page);
|
|
+ int tier = page_lru_tier(page);
|
|
+ int delta = thp_nr_pages(page);
|
|
+ struct lrugen *lrugen = &lruvec->evictable;
|
|
+
|
|
+ VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page);
|
|
+
|
|
+ /* an mlocked page? */
|
|
+ if (!page_evictable(page)) {
|
|
+ success = lru_gen_del_page(page, lruvec, true);
|
|
+ VM_BUG_ON_PAGE(!success, page);
|
|
+ SetPageUnevictable(page);
|
|
+ add_page_to_lru_list(page, lruvec);
|
|
+ __count_vm_events(UNEVICTABLE_PGCULLED, delta);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ /* a lazy-free page that has been written into? */
|
|
+ if (type && PageDirty(page) && PageAnon(page)) {
|
|
+ success = lru_gen_del_page(page, lruvec, true);
|
|
+ VM_BUG_ON_PAGE(!success, page);
|
|
+ SetPageSwapBacked(page);
|
|
+ add_page_to_lru_list_tail(page, lruvec);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ /* page_update_gen() has updated this page? */
|
|
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
|
|
+ list_move(&page->lru, &lrugen->lists[gen][type][zone]);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ /* protect this page if its tier has a higher refaulted % */
|
|
+ if (tier > tier_idx) {
|
|
+ int hist = lru_hist_from_seq(gen);
|
|
+
|
|
+ page_inc_gen(page, lruvec, false);
|
|
+ WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
|
|
+ lrugen->protected[hist][type][tier - 1] + delta);
|
|
+ __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ /* mark this page for reclaim if it's pending writeback */
|
|
+ if (PageWriteback(page) || (type && PageDirty(page))) {
|
|
+ page_inc_gen(page, lruvec, true);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc)
|
|
+{
|
|
+ bool success;
|
|
+
|
|
+ if (!sc->may_unmap && page_mapped(page))
|
|
+ return false;
|
|
+
|
|
+ if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) &&
|
|
+ (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page))))
|
|
+ return false;
|
|
+
|
|
+ if (!get_page_unless_zero(page))
|
|
+ return false;
|
|
+
|
|
+ if (!TestClearPageLRU(page)) {
|
|
+ put_page(page);
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ success = lru_gen_del_page(page, lruvec, true);
|
|
+ VM_BUG_ON_PAGE(!success, page);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static int scan_pages(struct lruvec *lruvec, struct scan_control *sc,
|
|
+ int type, int tier, struct list_head *list)
|
|
+{
|
|
+ int gen, zone;
|
|
+ enum vm_event_item item;
|
|
+ int sorted = 0;
|
|
+ int scanned = 0;
|
|
+ int isolated = 0;
|
|
+ int remaining = MAX_BATCH_SIZE;
|
|
+ struct lrugen *lrugen = &lruvec->evictable;
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
+
|
|
+ VM_BUG_ON(!list_empty(list));
|
|
+
|
|
+ if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
|
|
+ return 0;
|
|
+
|
|
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
|
|
+
|
|
+ for (zone = sc->reclaim_idx; zone >= 0; zone--) {
|
|
+ LIST_HEAD(moved);
|
|
+ int skipped = 0;
|
|
+ struct list_head *head = &lrugen->lists[gen][type][zone];
|
|
+
|
|
+ while (!list_empty(head)) {
|
|
+ struct page *page = lru_to_page(head);
|
|
+ int delta = thp_nr_pages(page);
|
|
+
|
|
+ VM_BUG_ON_PAGE(PageTail(page), page);
|
|
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
|
|
+ VM_BUG_ON_PAGE(PageActive(page), page);
|
|
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
|
|
+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
|
|
+
|
|
+ prefetchw_prev_lru_page(page, head, flags);
|
|
+
|
|
+ scanned += delta;
|
|
+
|
|
+ if (sort_page(page, lruvec, tier))
|
|
+ sorted += delta;
|
|
+ else if (isolate_page(page, lruvec, sc)) {
|
|
+ list_add(&page->lru, list);
|
|
+ isolated += delta;
|
|
+ } else {
|
|
+ list_move(&page->lru, &moved);
|
|
+ skipped += delta;
|
|
+ }
|
|
+
|
|
+ if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (skipped) {
|
|
+ list_splice(&moved, head);
|
|
+ __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
|
|
+ }
|
|
+
|
|
+ if (!remaining || isolated >= MIN_BATCH_SIZE)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
|
|
+ if (!cgroup_reclaim(sc)) {
|
|
+ __count_vm_events(item, isolated);
|
|
+ __count_vm_events(PGREFILL, sorted);
|
|
+ }
|
|
+ __count_memcg_events(memcg, item, isolated);
|
|
+ __count_memcg_events(memcg, PGREFILL, sorted);
|
|
+ __count_vm_events(PGSCAN_ANON + type, isolated);
|
|
+
|
|
+ /*
|
|
+ * We may have trouble finding eligible pages due to reclaim_idx,
|
|
+ * may_unmap and may_writepage. Check `remaining` to make sure we won't
|
|
+ * be stuck if we aren't making enough progress.
|
|
+ */
|
|
+ return isolated || !remaining ? scanned : 0;
|
|
+}
|
|
+
|
|
+static int get_tier_idx(struct lruvec *lruvec, int type)
|
|
+{
|
|
+ int tier;
|
|
+ struct ctrl_pos sp, pv;
|
|
+
|
|
+ /*
|
|
+ * Ideally we don't want to evict upper tiers that have higher refaulted
|
|
+ * %. However, we need to leave a margin for the fluctuation in
|
|
+ * refaulted %. So we use a larger gain factor to make sure upper tiers
|
|
+ * are indeed more active. We choose 2 because the lowest upper tier
|
|
+ * would have twice the refaulted % of the base tier, according to their
|
|
+ * numbers of accesses.
|
|
+ */
|
|
+ read_ctrl_pos(lruvec, type, 0, 1, &sp);
|
|
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
|
|
+ read_ctrl_pos(lruvec, type, tier, 2, &pv);
|
|
+ if (!positive_ctrl_err(&sp, &pv))
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return tier - 1;
|
|
+}
|
|
+
|
|
+static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
|
|
+{
|
|
+ int type, tier;
|
|
+ struct ctrl_pos sp, pv;
|
|
+ int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
|
|
+
|
|
+ /*
|
|
+ * Compare refaulted % between the base tiers of anon and file to
|
|
+ * determine which type to evict. Also need to compare refaulted % of
|
|
+ * the upper tiers of the selected type with that of the base tier of
|
|
+ * the other type to determine which tier of the selected type to evict.
|
|
+ */
|
|
+ read_ctrl_pos(lruvec, 0, 0, gain[0], &sp);
|
|
+ read_ctrl_pos(lruvec, 1, 0, gain[1], &pv);
|
|
+ type = positive_ctrl_err(&sp, &pv);
|
|
+
|
|
+ read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
|
|
+ for (tier = 1; tier < MAX_NR_TIERS; tier++) {
|
|
+ read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
|
|
+ if (!positive_ctrl_err(&sp, &pv))
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ *tier_idx = tier - 1;
|
|
+
|
|
+ return type;
|
|
+}
|
|
+
|
|
+static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
|
|
+ int *type_scanned, struct list_head *list)
|
|
+{
|
|
+ int i;
|
|
+ int type;
|
|
+ int scanned;
|
|
+ int tier = -1;
|
|
+ DEFINE_MIN_SEQ(lruvec);
|
|
+
|
|
+ VM_BUG_ON(!seq_is_valid(lruvec));
|
|
+
|
|
+ /*
|
|
+ * Try to select a type based on generations and swappiness, and if that
|
|
+ * fails, fall back to get_type_to_scan(). When anon and file are both
|
|
+ * available from the same generation, swappiness 200 is interpreted as
|
|
+ * anon first and swappiness 1 is interpreted as file first.
|
|
+ */
|
|
+ if (!swappiness)
|
|
+ type = 1;
|
|
+ else if (min_seq[0] < min_seq[1])
|
|
+ type = 0;
|
|
+ else if (swappiness == 1)
|
|
+ type = 1;
|
|
+ else if (swappiness == 200)
|
|
+ type = 0;
|
|
+ else
|
|
+ type = get_type_to_scan(lruvec, swappiness, &tier);
|
|
+
|
|
+ for (i = !swappiness; i < ANON_AND_FILE; i++) {
|
|
+ if (tier < 0)
|
|
+ tier = get_tier_idx(lruvec, type);
|
|
+
|
|
+ scanned = scan_pages(lruvec, sc, type, tier, list);
|
|
+ if (scanned)
|
|
+ break;
|
|
+
|
|
+ type = !type;
|
|
+ tier = -1;
|
|
+ }
|
|
+
|
|
+ *type_scanned = type;
|
|
+
|
|
+ return scanned;
|
|
+}
|
|
+
|
|
+/* Main function used by the foreground, the background and the user-triggered eviction. */
|
|
+static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
|
|
+{
|
|
+ int type;
|
|
+ int scanned;
|
|
+ int reclaimed;
|
|
+ LIST_HEAD(list);
|
|
+ struct page *page;
|
|
+ enum vm_event_item item;
|
|
+ struct reclaim_stat stat;
|
|
+ struct mm_walk_args *args;
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
|
+
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ scanned = isolate_pages(lruvec, sc, swappiness, &type, &list);
|
|
+
|
|
+ if (try_to_inc_min_seq(lruvec, swappiness))
|
|
+ scanned++;
|
|
+
|
|
+ if (get_nr_gens(lruvec, 1) == MIN_NR_GENS)
|
|
+ scanned = 0;
|
|
+
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ if (list_empty(&list))
|
|
+ return scanned;
|
|
+
|
|
+ reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false);
|
|
+ /*
|
|
+ * We need to prevent rejected pages from being added back to the same
|
|
+ * lists they were isolated from. Otherwise we may risk looping on them
|
|
+ * forever.
|
|
+ */
|
|
+ list_for_each_entry(page, &list, lru) {
|
|
+ if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page)))
|
|
+ SetPageActive(page);
|
|
+
|
|
+ ClearPageReferenced(page);
|
|
+ ClearPageWorkingset(page);
|
|
+ }
|
|
+
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ move_pages_to_lru(lruvec, &list);
|
|
+
|
|
+ args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL;
|
|
+ if (args && args->batch_size)
|
|
+ reset_batch_size(lruvec, args);
|
|
+
|
|
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
|
|
+ if (!cgroup_reclaim(sc))
|
|
+ __count_vm_events(item, reclaimed);
|
|
+ __count_memcg_events(memcg, item, reclaimed);
|
|
+ __count_vm_events(PGSTEAL_ANON + type, reclaimed);
|
|
+
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ mem_cgroup_uncharge_list(&list);
|
|
+ free_unref_page_list(&list);
|
|
+
|
|
+ sc->nr_reclaimed += reclaimed;
|
|
+
|
|
+ return scanned;
|
|
+}
|
|
+
|
|
+static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
|
|
+{
|
|
+ bool low;
|
|
+ long nr_to_scan;
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
+ int priority = sc->priority;
|
|
+ DEFINE_MAX_SEQ(lruvec);
|
|
+ DEFINE_MIN_SEQ(lruvec);
|
|
+
|
|
+ if (mem_cgroup_below_min(memcg) ||
|
|
+ (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
|
|
+ return 0;
|
|
+
|
|
+ if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
|
|
+ priority = DEF_PRIORITY;
|
|
+ sc->force_deactivate = 0;
|
|
+ }
|
|
+
|
|
+ nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low);
|
|
+ if (!nr_to_scan)
|
|
+ return 0;
|
|
+
|
|
+ nr_to_scan >>= priority;
|
|
+
|
|
+ if (!mem_cgroup_online(memcg))
|
|
+ nr_to_scan++;
|
|
+
|
|
+ if (!nr_to_scan)
|
|
+ return 0;
|
|
+
|
|
+ if (current_is_kswapd()) {
|
|
+ /* leave the work to lru_gen_age_node() */
|
|
+ if (max_seq - min_seq[1] < MIN_NR_GENS)
|
|
+ return 0;
|
|
+
|
|
+ if (!low)
|
|
+ sc->force_deactivate = 0;
|
|
+
|
|
+ return nr_to_scan;
|
|
+ }
|
|
+
|
|
+ if (max_seq - min_seq[1] >= MIN_NR_GENS)
|
|
+ return nr_to_scan;
|
|
+
|
|
+ /* move onto slab and other memcgs if we haven't tried them all */
|
|
+ if (!sc->force_deactivate) {
|
|
+ sc->skipped_deactivate = 1;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0;
|
|
+}
|
|
+
|
|
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
|
+{
|
|
+ struct blk_plug plug;
|
|
+ long scanned = 0;
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
+ struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
|
+
|
|
+ lru_add_drain();
|
|
+
|
|
+ if (current_is_kswapd())
|
|
+ current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args;
|
|
+
|
|
+ blk_start_plug(&plug);
|
|
+
|
|
+ while (true) {
|
|
+ int delta;
|
|
+ int swappiness;
|
|
+ long nr_to_scan;
|
|
+
|
|
+ if (sc->may_swap)
|
|
+ swappiness = get_swappiness(memcg);
|
|
+ else if (!cgroup_reclaim(sc) && get_swappiness(memcg))
|
|
+ swappiness = 1;
|
|
+ else
|
|
+ swappiness = 0;
|
|
+
|
|
+ nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
|
|
+ if (!nr_to_scan)
|
|
+ break;
|
|
+
|
|
+ delta = evict_pages(lruvec, sc, swappiness);
|
|
+ if (!delta)
|
|
+ break;
|
|
+
|
|
+ scanned += delta;
|
|
+ if (scanned >= nr_to_scan)
|
|
+ break;
|
|
+
|
|
+ cond_resched();
|
|
+ }
|
|
+
|
|
+ blk_finish_plug(&plug);
|
|
+
|
|
+ if (current_is_kswapd())
|
|
+ current->reclaim_state->mm_walk_args = NULL;
|
|
+}
|
|
+
|
|
+/******************************************************************************
|
|
* state change
|
|
******************************************************************************/
|
|
|
|
@@ -4420,6 +4961,10 @@ static void lru_gen_age_node(struct pgli
|
|
{
|
|
}
|
|
|
|
+static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
|
+{
|
|
+}
|
|
+
|
|
#endif /* CONFIG_LRU_GEN */
|
|
|
|
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
|
|
@@ -4433,6 +4978,11 @@ static void shrink_lruvec(struct lruvec
|
|
struct blk_plug plug;
|
|
bool scan_adjusted;
|
|
|
|
+ if (lru_gen_enabled()) {
|
|
+ lru_gen_shrink_lruvec(lruvec, sc);
|
|
+ return;
|
|
+ }
|
|
+
|
|
get_scan_count(lruvec, sc, nr);
|
|
|
|
/* Record the original scan target for proportional adjustments later */
|
|
@@ -4906,6 +5456,9 @@ static void snapshot_refaults(struct mem
|
|
struct lruvec *target_lruvec;
|
|
unsigned long refaults;
|
|
|
|
+ if (lru_gen_enabled())
|
|
+ return;
|
|
+
|
|
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
|
|
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
|
|
target_lruvec->refaults[0] = refaults;
|
|
--- a/mm/workingset.c
|
|
+++ b/mm/workingset.c
|
|
@@ -187,7 +187,6 @@ static unsigned int bucket_order __read_
|
|
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
|
|
bool workingset)
|
|
{
|
|
- eviction >>= bucket_order;
|
|
eviction &= EVICTION_MASK;
|
|
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
|
|
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
|
|
@@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow,
|
|
|
|
*memcgidp = memcgid;
|
|
*pgdat = NODE_DATA(nid);
|
|
- *evictionp = entry << bucket_order;
|
|
+ *evictionp = entry;
|
|
*workingsetp = workingset;
|
|
}
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
+
|
|
+static int page_lru_refs(struct page *page)
|
|
+{
|
|
+ unsigned long flags = READ_ONCE(page->flags);
|
|
+
|
|
+ BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);
|
|
+
|
|
+ /* see the comment on MAX_NR_TIERS */
|
|
+ return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0;
|
|
+}
|
|
+
|
|
+/* Return a token to be stored in the shadow entry of a page being evicted. */
|
|
+static void *lru_gen_eviction(struct page *page)
|
|
+{
|
|
+ int hist, tier;
|
|
+ unsigned long token;
|
|
+ unsigned long min_seq;
|
|
+ struct lruvec *lruvec;
|
|
+ struct lrugen *lrugen;
|
|
+ int type = page_is_file_lru(page);
|
|
+ int refs = page_lru_refs(page);
|
|
+ int delta = thp_nr_pages(page);
|
|
+ bool workingset = PageWorkingset(page);
|
|
+ struct mem_cgroup *memcg = page_memcg(page);
|
|
+ struct pglist_data *pgdat = page_pgdat(page);
|
|
+
|
|
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
|
+ lrugen = &lruvec->evictable;
|
|
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
|
|
+ token = (min_seq << LRU_REFS_WIDTH) | refs;
|
|
+
|
|
+ hist = lru_hist_from_seq(min_seq);
|
|
+ tier = lru_tier_from_refs(refs + workingset);
|
|
+ atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);
|
|
+
|
|
+ return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
|
|
+}
|
|
+
|
|
+/* Count a refaulted page based on the token stored in its shadow entry. */
|
|
+static void lru_gen_refault(struct page *page, void *shadow)
|
|
+{
|
|
+ int hist, tier, refs;
|
|
+ int memcg_id;
|
|
+ bool workingset;
|
|
+ unsigned long token;
|
|
+ unsigned long min_seq;
|
|
+ struct lruvec *lruvec;
|
|
+ struct lrugen *lrugen;
|
|
+ struct mem_cgroup *memcg;
|
|
+ struct pglist_data *pgdat;
|
|
+ int type = page_is_file_lru(page);
|
|
+ int delta = thp_nr_pages(page);
|
|
+
|
|
+ unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset);
|
|
+ if (page_pgdat(page) != pgdat)
|
|
+ return;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ memcg = page_memcg_rcu(page);
|
|
+ if (mem_cgroup_id(memcg) != memcg_id)
|
|
+ goto unlock;
|
|
+
|
|
+ refs = token & (BIT(LRU_REFS_WIDTH) - 1);
|
|
+ if (refs && !workingset)
|
|
+ goto unlock;
|
|
+
|
|
+ token >>= LRU_REFS_WIDTH;
|
|
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
|
+ lrugen = &lruvec->evictable;
|
|
+ min_seq = READ_ONCE(lrugen->min_seq[type]);
|
|
+ if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)))
|
|
+ goto unlock;
|
|
+
|
|
+ hist = lru_hist_from_seq(min_seq);
|
|
+ tier = lru_tier_from_refs(refs + workingset);
|
|
+ atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);
|
|
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);
|
|
+
|
|
+ /*
|
|
+ * Tiers don't offer any protection to pages accessed via page tables.
|
|
+ * That's what generations do. Tiers can't fully protect pages after
|
|
+ * their numbers of accesses have exceeded the max value. Conservatively
|
|
+ * count these two conditions as stalls even though they might not
|
|
+ * indicate any real memory pressure.
|
|
+ */
|
|
+ if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) {
|
|
+ SetPageWorkingset(page);
|
|
+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
|
|
+ }
|
|
+unlock:
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+#else
|
|
+
|
|
+static void *lru_gen_eviction(struct page *page)
|
|
+{
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void lru_gen_refault(struct page *page, void *shadow)
|
|
+{
|
|
+}
|
|
+
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
+
|
|
/**
|
|
* workingset_age_nonresident - age non-resident entries as LRU ages
|
|
* @lruvec: the lruvec that was aged
|
|
@@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p
|
|
VM_BUG_ON_PAGE(page_count(page), page);
|
|
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
|
|
|
+ if (lru_gen_enabled())
|
|
+ return lru_gen_eviction(page);
|
|
+
|
|
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
|
|
/* XXX: target_memcg can be NULL, go through lruvec */
|
|
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
|
|
eviction = atomic_long_read(&lruvec->nonresident_age);
|
|
+ eviction >>= bucket_order;
|
|
workingset_age_nonresident(lruvec, thp_nr_pages(page));
|
|
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
|
|
}
|
|
@@ -296,7 +406,13 @@ void workingset_refault(struct page *pag
|
|
bool workingset;
|
|
int memcgid;
|
|
|
|
+ if (lru_gen_enabled()) {
|
|
+ lru_gen_refault(page, shadow);
|
|
+ return;
|
|
+ }
|
|
+
|
|
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
|
|
+ eviction <<= bucket_order;
|
|
|
|
rcu_read_lock();
|
|
/*
|