kernel: add and enable MGLRU for Linux 5.15

Backport a preliminary version of Yu Zhao's multi-generational LRU, for
improved memory management. Refresh the patches while at it.

Signed-off-by: Rui Salvaterra <rsalvaterra@gmail.com>
This commit is contained in:
Rui Salvaterra 2022-03-29 17:10:48 +01:00 committed by Daniel Golle
parent 0be1b78856
commit 05158082f6
11 changed files with 5180 additions and 0 deletions

View File

@ -3195,6 +3195,9 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=12
# CONFIG_LPC_ICH is not set
# CONFIG_LPC_SCH is not set
# CONFIG_LP_CONSOLE is not set
CONFIG_LRU_GEN=y
CONFIG_LRU_GEN_ENABLED=y
# CONFIG_LRU_GEN_STATS is not set
# CONFIG_LSI_ET1011C_PHY is not set
CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity"
CONFIG_LSM_MMAP_MIN_ADDR=65536
@ -4388,6 +4391,7 @@ CONFIG_NMI_LOG_BUF_SHIFT=13
# CONFIG_NO_HZ is not set
# CONFIG_NO_HZ_FULL is not set
# CONFIG_NO_HZ_IDLE is not set
CONFIG_NR_LRU_GENS=7
# CONFIG_NS83820 is not set
# CONFIG_NTB is not set
# CONFIG_NTFS3_64BIT_CLUSTER is not set
@ -6480,6 +6484,7 @@ CONFIG_THIN_ARCHIVES=y
# CONFIG_THUNDER_NIC_VF is not set
# CONFIG_TICK_CPU_ACCOUNTING is not set
CONFIG_TICK_ONESHOT=y
CONFIG_TIERS_PER_GEN=4
# CONFIG_TIFM_CORE is not set
# CONFIG_TIGON3 is not set
# CONFIG_TIMB_DMA is not set

View File

@ -0,0 +1,169 @@
From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 4 Aug 2021 01:31:34 -0600
Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young()
Some architectures automatically set the accessed bit in PTEs, e.g.,
x86 and arm64 v8.2. On architectures that do not have this capability,
clearing the accessed bit in a PTE triggers a page fault following the
TLB miss of this PTE.
Being aware of this capability can help make better decisions, i.e.,
whether to limit the size of each batch of PTEs and the burst of
batches when clearing the accessed bit.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
---
arch/arm64/include/asm/cpufeature.h | 5 +++++
arch/arm64/include/asm/pgtable.h | 13 ++++++++-----
arch/arm64/kernel/cpufeature.c | 10 ++++++++++
arch/arm64/tools/cpucaps | 1 +
arch/x86/include/asm/pgtable.h | 6 +++---
include/linux/pgtable.h | 13 +++++++++++++
mm/memory.c | 14 +-------------
7 files changed, 41 insertions(+), 21 deletions(-)
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r
cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
}
+static inline bool system_has_hw_af(void)
+{
+ return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF);
+}
+
extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru
* page after fork() + CoW for pfn mappings. We don't always have a
* hardware-managed access flag on arm64.
*/
-static inline bool arch_faults_on_old_pte(void)
+static inline bool arch_has_hw_pte_young(bool local)
{
- WARN_ON(preemptible());
+ if (local) {
+ WARN_ON(preemptible());
+ return cpu_has_hw_af();
+ }
- return !cpu_has_hw_af();
+ return system_has_hw_af();
}
-#define arch_faults_on_old_pte arch_faults_on_old_pte
+#define arch_has_hw_pte_young arch_has_hw_pte_young
/*
* Experimentally, it's cheap to set the access flag in hardware and we
@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt
*/
static inline bool arch_wants_old_prefaulted_pte(void)
{
- return !arch_faults_on_old_pte();
+ return arch_has_hw_pte_young(true);
}
#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2184,6 +2184,16 @@ static const struct arm64_cpu_capabiliti
.matches = has_hw_dbm,
.cpu_enable = cpu_enable_hw_dbm,
},
+ {
+ .desc = "Hardware update of the Access flag",
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .capability = ARM64_HW_AF,
+ .sys_reg = SYS_ID_AA64MMFR1_EL1,
+ .sign = FTR_UNSIGNED,
+ .field_pos = ID_AA64MMFR1_HADBS_SHIFT,
+ .min_field_value = 1,
+ .matches = has_cpuid_feature,
+ },
#endif
{
.desc = "CRC32 instructions",
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -35,6 +35,7 @@ HAS_STAGE2_FWB
HAS_SYSREG_GIC_CPUIF
HAS_TLB_RANGE
HAS_VIRT_HOST_EXTN
+HW_AF
HW_DBM
KVM_PROTECTED_MODE
MISMATCHED_CACHE_TYPE
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c
return boot_cpu_has_bug(X86_BUG_L1TF);
}
-#define arch_faults_on_old_pte arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
+#define arch_has_hw_pte_young arch_has_hw_pte_young
+static inline bool arch_has_hw_pte_young(bool local)
{
- return false;
+ return true;
}
#endif /* __ASSEMBLY__ */
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
+#ifndef arch_has_hw_pte_young
+/*
+ * Return whether the accessed bit is supported by the local CPU or all CPUs.
+ *
+ * Those arches which have hw access flag feature need to implement their own
+ * helper. By default, "false" means pagefault will be hit on old pte.
+ */
+static inline bool arch_has_hw_pte_young(bool local)
+{
+ return false;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly =
2;
#endif
-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
- /*
- * Those arches which don't have hw access flag feature need to
- * implement their own helper. By default, "true" means pagefault
- * will be hit on old pte.
- */
- return true;
-}
-#endif
-
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct
* On architectures with software "accessed" bits, we would
* take a double page fault, so mark it accessed here.
*/
- if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+ if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) {
pte_t entry;
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);

View File

@ -0,0 +1,111 @@
From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 26 Sep 2020 21:17:18 -0600
Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
Some architectures support the accessed bit on non-leaf PMD entries,
e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using
it as part of linear address translation [1]. As an optimization, page
table walkers who are interested in the accessed bit can skip the PTEs
under a non-leaf PMD entry if the accessed bit is cleared on this PMD
entry.
Although an inline function may be preferable, this capability is
added as a configuration option to look consistent when used with the
existing macros.
[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
Volume 3 (June 2021), section 4.8
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
---
arch/Kconfig | 9 +++++++++
arch/x86/Kconfig | 1 +
arch/x86/include/asm/pgtable.h | 3 ++-
arch/x86/mm/pgtable.c | 5 ++++-
include/linux/pgtable.h | 4 ++--
5 files changed, 18 insertions(+), 4 deletions(-)
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT
config ARCH_HAS_PARANOID_L1D_FLUSH
bool
+config ARCH_HAS_NONLEAF_PMD_YOUNG
+ bool
+ depends on PGTABLE_LEVELS > 2
+ help
+ Architectures that select this are able to set the accessed bit on
+ non-leaf PMD entries in addition to leaf PTE entries where pages are
+ mapped. For them, page table walkers that clear the accessed bit may
+ stop at non-leaf PMD entries if they do not see the accessed bit.
+
source "kernel/gcov/Kconfig"
source "scripts/gcc-plugins/Kconfig"
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -84,6 +84,7 @@ config X86
select ARCH_HAS_PMEM_API if X86_64
select ARCH_HAS_PTE_DEVMAP if X86_64
select ARCH_HAS_PTE_SPECIAL
+ select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64
select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
select ARCH_HAS_COPY_MC if X86_64
select ARCH_HAS_SET_MEMORY
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad
static inline int pmd_bad(pmd_t pmd)
{
- return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
+ (_KERNPG_TABLE & ~_PAGE_ACCESSED);
}
static inline unsigned long pages_to_mb(unsigned long npg)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_
return ret;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_
return ret;
}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pud_t *pudp)
{
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo
#endif
#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
unsigned long address,
pmd_t *pmdp)
@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo
BUILD_BUG();
return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif
#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH

View File

@ -0,0 +1,224 @@
From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 27 Sep 2020 20:49:08 -0600
Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node()
This patch refactors shrink_node(). This will make the upcoming
changes to mm/vmscan.c more readable.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
---
mm/vmscan.c | 186 +++++++++++++++++++++++++++-------------------------
1 file changed, 98 insertions(+), 88 deletions(-)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2562,6 +2562,103 @@ enum scan_balance {
SCAN_FILE,
};
+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+ unsigned long file;
+ struct lruvec *target_lruvec;
+
+ target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+ /*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&target_lruvec->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&target_lruvec->lru_lock);
+
+ /*
+ * Target desirable inactive:active list ratios for the anon
+ * and file LRU lists.
+ */
+ if (!sc->force_deactivate) {
+ unsigned long refaults;
+
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[0] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ sc->may_deactivate |= DEACTIVATE_ANON;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+ /*
+ * When refaults are being observed, it means a new
+ * workingset is being established. Deactivate to get
+ * rid of any stale active pages quickly.
+ */
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[1] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+ sc->may_deactivate |= DEACTIVATE_FILE;
+ else
+ sc->may_deactivate &= ~DEACTIVATE_FILE;
+ } else
+ sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+ /*
+ * If we have plenty of inactive file pages that aren't
+ * thrashing, try to reclaim those first before touching
+ * anonymous pages.
+ */
+ file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+ if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+ sc->cache_trim_mode = 1;
+ else
+ sc->cache_trim_mode = 0;
+
+ /*
+ * Prevent the reclaimer from falling into the cache trap: as
+ * cache pages start out inactive, every cache fault will tip
+ * the scan balance towards the file LRU. And as the file LRU
+ * shrinks, so does the window for rotation from references.
+ * This means we have a runaway feedback loop where a tiny
+ * thrashing file LRU becomes infinitely more attractive than
+ * anon pages. Try to detect this based on file LRU size.
+ */
+ if (!cgroup_reclaim(sc)) {
+ unsigned long total_high_wmark = 0;
+ unsigned long free, anon;
+ int z;
+
+ free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+ file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_FILE);
+
+ for (z = 0; z < MAX_NR_ZONES; z++) {
+ struct zone *zone = &pgdat->node_zones[z];
+
+ if (!managed_zone(zone))
+ continue;
+
+ total_high_wmark += high_wmark_pages(zone);
+ }
+
+ /*
+ * Consider anon: if that's low too, this isn't a
+ * runaway file reclaim problem, but rather just
+ * extreme pressure. Reclaim as per usual then.
+ */
+ anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ sc->file_is_tiny =
+ file + free <= total_high_wmark &&
+ !(sc->may_deactivate & DEACTIVATE_ANON) &&
+ anon >> sc->priority;
+ }
+}
+
/*
* Determine how aggressively the anon and file LRU lists should be
* scanned. The relative value of each set of LRU lists is determined
@@ -3032,7 +3129,6 @@ static void shrink_node(pg_data_t *pgdat
unsigned long nr_reclaimed, nr_scanned;
struct lruvec *target_lruvec;
bool reclaimable = false;
- unsigned long file;
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
@@ -3048,93 +3144,7 @@ again:
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- /*
- * Determine the scan balance between anon and file LRUs.
- */
- spin_lock_irq(&target_lruvec->lru_lock);
- sc->anon_cost = target_lruvec->anon_cost;
- sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&target_lruvec->lru_lock);
-
- /*
- * Target desirable inactive:active list ratios for the anon
- * and file LRU lists.
- */
- if (!sc->force_deactivate) {
- unsigned long refaults;
-
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_ANON);
- if (refaults != target_lruvec->refaults[0] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
- sc->may_deactivate |= DEACTIVATE_ANON;
- else
- sc->may_deactivate &= ~DEACTIVATE_ANON;
-
- /*
- * When refaults are being observed, it means a new
- * workingset is being established. Deactivate to get
- * rid of any stale active pages quickly.
- */
- refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE_FILE);
- if (refaults != target_lruvec->refaults[1] ||
- inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
- sc->may_deactivate |= DEACTIVATE_FILE;
- else
- sc->may_deactivate &= ~DEACTIVATE_FILE;
- } else
- sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
- /*
- * If we have plenty of inactive file pages that aren't
- * thrashing, try to reclaim those first before touching
- * anonymous pages.
- */
- file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
- if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
- sc->cache_trim_mode = 1;
- else
- sc->cache_trim_mode = 0;
-
- /*
- * Prevent the reclaimer from falling into the cache trap: as
- * cache pages start out inactive, every cache fault will tip
- * the scan balance towards the file LRU. And as the file LRU
- * shrinks, so does the window for rotation from references.
- * This means we have a runaway feedback loop where a tiny
- * thrashing file LRU becomes infinitely more attractive than
- * anon pages. Try to detect this based on file LRU size.
- */
- if (!cgroup_reclaim(sc)) {
- unsigned long total_high_wmark = 0;
- unsigned long free, anon;
- int z;
-
- free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
- file = node_page_state(pgdat, NR_ACTIVE_FILE) +
- node_page_state(pgdat, NR_INACTIVE_FILE);
-
- for (z = 0; z < MAX_NR_ZONES; z++) {
- struct zone *zone = &pgdat->node_zones[z];
- if (!managed_zone(zone))
- continue;
-
- total_high_wmark += high_wmark_pages(zone);
- }
-
- /*
- * Consider anon: if that's low too, this isn't a
- * runaway file reclaim problem, but rather just
- * extreme pressure. Reclaim as per usual then.
- */
- anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
- sc->file_is_tiny =
- file + free <= total_high_wmark &&
- !(sc->may_deactivate & DEACTIVATE_ANON) &&
- anon >> sc->priority;
- }
+ prepare_scan_count(pgdat, sc);
shrink_node_memcgs(pgdat, sc);

View File

@ -0,0 +1,996 @@
From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:12:33 -0700
Subject: [PATCH 04/10] mm: multigenerational lru: groundwork
For each lruvec, evictable pages are divided into multiple
generations. The youngest generation number is stored in
lrugen->max_seq for both anon and file types as they are aged on an
equal footing. The oldest generation numbers are stored in
lrugen->min_seq[] separately for anon and file types as clean file
pages can be evicted regardless of swap constraints. These three
variables are monotonically increasing. Generation numbers are
truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
page->flags. The sliding window technique is used to prevent truncated
generation numbers from overlapping. Each truncated generation number
is an index to
lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
The framework comprises two conceptually independent components: the
aging, which produces young generations, and the eviction, which
consumes old generations. Both can be invoked independently from user
space for the purpose of working set estimation and proactive reclaim.
The protection of hot pages and the selection of cold pages are based
on page access types and patterns. There are two access types: one via
page tables and the other via file descriptors. The protection of the
former type is by design stronger because:
1) The uncertainty in determining the access patterns of the former
type is higher due to the coalesced nature of the accessed bit.
2) The cost of evicting the former type is higher due to the TLB
flushes required and the likelihood of involving I/O.
3) The penalty of under-protecting the former type is higher because
applications usually do not prepare themselves for major faults like
they do for blocked I/O. For example, client applications commonly
dedicate blocked I/O to separate threads to avoid UI janks that
negatively affect user experience.
There are also two access patterns: one with temporal locality and the
other without. The latter pattern, e.g., random and sequential, needs
to be explicitly excluded to avoid weakening the protection of the
former pattern. Generally the former type follows the former pattern
unless MADV_SEQUENTIAL is specified and the latter type follows the
latter pattern unless outlying refaults have been observed.
Upon faulting, a page is added to the youngest generation, which
provides the strongest protection as the eviction will not consider
this page before the aging has scanned it at least twice. The first
scan clears the accessed bit set during the initial fault. And the
second scan makes sure this page has not been used since the first
scan. A page from any other generations is brought back to the
youngest generation whenever the aging finds the accessed bit set on
any of the PTEs mapping this page.
Unmapped pages are initially added to the oldest generation and then
conditionally protected by tiers. This is done later [PATCH 07/10].
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
---
fs/fuse/dev.c | 3 +-
include/linux/cgroup.h | 15 +-
include/linux/mm.h | 36 ++++
include/linux/mm_inline.h | 182 ++++++++++++++++++++
include/linux/mmzone.h | 70 ++++++++
include/linux/page-flags-layout.h | 19 ++-
include/linux/page-flags.h | 4 +-
include/linux/sched.h | 3 +
kernel/bounds.c | 3 +
kernel/cgroup/cgroup-internal.h | 1 -
mm/huge_memory.c | 3 +-
mm/memcontrol.c | 1 +
mm/memory.c | 7 +
mm/mm_init.c | 6 +-
mm/page_alloc.c | 1 +
mm/swap.c | 9 +-
mm/swapfile.c | 2 +
mm/vmscan.c | 268 ++++++++++++++++++++++++++++++
18 files changed, 618 insertions(+), 15 deletions(-)
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
1 << PG_active |
1 << PG_workingset |
1 << PG_reclaim |
- 1 << PG_waiters))) {
+ 1 << PG_waiters |
+ LRU_GEN_MASK | LRU_REFS_MASK))) {
dump_page(page, "fuse: trying to steal weird page");
return 1;
}
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr
css_put(&cgrp->self);
}
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -707,6 +718,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
/*
* Define the bit shifts to access each section. For non-existent
@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s
loff_t const holebegin, loff_t const holelen, int even_cows) { }
#endif
+#ifdef CONFIG_LRU_GEN
+static inline void task_enter_nonseq_fault(void)
+{
+ WARN_ON(current->in_nonseq_fault);
+
+ current->in_nonseq_fault = 1;
+}
+
+static inline void task_exit_nonseq_fault(void)
+{
+ WARN_ON(!current->in_nonseq_fault);
+
+ current->in_nonseq_fault = 0;
+}
+
+static inline bool task_in_nonseq_fault(void)
+{
+ return current->in_nonseq_fault;
+}
+#else
+static inline void task_enter_nonseq_fault(void)
+{
+}
+
+static inline void task_exit_nonseq_fault(void)
+{
+}
+
+static inline bool task_in_nonseq_fault(void)
+{
+ return false;
+}
+#endif /* CONFIG_LRU_GEN */
+
static inline void unmap_shared_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen)
{
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag
return lru;
}
+#ifdef CONFIG_LRU_GEN
+
+static inline bool lru_gen_enabled(void)
+{
+#ifdef CONFIG_LRU_GEN_ENABLED
+ DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
+
+ return static_branch_likely(&lru_gen_static_key);
+#else
+ DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
+
+ return static_branch_unlikely(&lru_gen_static_key);
+#endif
+}
+
+/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+ return seq % MAX_NR_GENS;
+}
+
+/* The youngest and the second youngest generations are counted as active. */
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+ unsigned long max_seq = lruvec->evictable.max_seq;
+
+ VM_BUG_ON(gen >= MAX_NR_GENS);
+
+ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+/* Update the sizes of the multigenerational lru lists. */
+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
+ int old_gen, int new_gen)
+{
+ int type = page_is_file_lru(page);
+ int zone = page_zonenum(page);
+ int delta = thp_nr_pages(page);
+ enum lru_list lru = type * LRU_FILE;
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ lockdep_assert_held(&lruvec->lru_lock);
+ VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
+ VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
+ VM_BUG_ON(old_gen == -1 && new_gen == -1);
+
+ if (old_gen >= 0)
+ WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
+ lrugen->sizes[old_gen][type][zone] - delta);
+ if (new_gen >= 0)
+ WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
+ lrugen->sizes[new_gen][type][zone] + delta);
+
+ if (old_gen < 0) {
+ if (lru_gen_is_active(lruvec, new_gen))
+ lru += LRU_ACTIVE;
+ update_lru_size(lruvec, lru, zone, delta);
+ return;
+ }
+
+ if (new_gen < 0) {
+ if (lru_gen_is_active(lruvec, old_gen))
+ lru += LRU_ACTIVE;
+ update_lru_size(lruvec, lru, zone, -delta);
+ return;
+ }
+
+ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
+ update_lru_size(lruvec, lru, zone, -delta);
+ update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
+ }
+
+ VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
+}
+
+/* Add a page to one of the multigenerational lru lists. Return true on success. */
+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+ int gen;
+ unsigned long old_flags, new_flags;
+ int type = page_is_file_lru(page);
+ int zone = page_zonenum(page);
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ if (PageUnevictable(page) || !lrugen->enabled[type])
+ return false;
+ /*
+ * If a page shouldn't be considered for eviction, i.e., a page mapped
+ * upon fault during which the accessed bit is set, add it to the
+ * youngest generation.
+ *
+ * If a page can't be evicted immediately, i.e., an anon page not in
+ * swap cache or a dirty page pending writeback, add it to the second
+ * oldest generation.
+ *
+ * If a page could be evicted immediately, e.g., a clean page, add it to
+ * the oldest generation.
+ */
+ if (PageActive(page))
+ gen = lru_gen_from_seq(lrugen->max_seq);
+ else if ((!type && !PageSwapCache(page)) ||
+ (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
+ gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
+ else
+ gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+ do {
+ new_flags = old_flags = READ_ONCE(page->flags);
+ VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
+
+ new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+ lru_gen_update_size(page, lruvec, -1, gen);
+ /* for rotate_reclaimable_page() */
+ if (reclaiming)
+ list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+ else
+ list_add(&page->lru, &lrugen->lists[gen][type][zone]);
+
+ return true;
+}
+
+/* Delete a page from one of the multigenerational lru lists. Return true on success. */
+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+ int gen;
+ unsigned long old_flags, new_flags;
+
+ do {
+ new_flags = old_flags = READ_ONCE(page->flags);
+ if (!(new_flags & LRU_GEN_MASK))
+ return false;
+
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+
+ gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+ new_flags &= ~LRU_GEN_MASK;
+ /* for shrink_page_list() */
+ if (reclaiming)
+ new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
+ else if (lru_gen_is_active(lruvec, gen))
+ new_flags |= BIT(PG_active);
+ } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+ lru_gen_update_size(page, lruvec, gen, -1);
+ list_del(&page->lru);
+
+ return true;
+}
+
+#else
+
+static inline bool lru_gen_enabled(void)
+{
+ return false;
+}
+
+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+ return false;
+}
+
+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+ return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec)
{
enum lru_list lru = page_lru(page);
+ if (lru_gen_add_page(page, lruvec, false))
+ return;
+
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add(&page->lru, &lruvec->lists[lru]);
}
@@ -93,6 +269,9 @@ static __always_inline void add_page_to_
{
enum lru_list lru = page_lru(page);
+ if (lru_gen_add_page(page, lruvec, true))
+ return;
+
update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
list_add_tail(&page->lru, &lruvec->lists[lru]);
}
@@ -100,6 +279,9 @@ static __always_inline void add_page_to_
static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec)
{
+ if (lru_gen_del_page(page, lruvec, false))
+ return;
+
list_del(&page->lru);
update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-thp_nr_pages(page));
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -294,6 +294,72 @@ enum lruvec_flags {
*/
};
+struct lruvec;
+
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+/*
+ * For each lruvec, evictable pages are divided into multiple generations. The
+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
+ * monotonically increasing. The sliding window technique is used to track at
+ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
+ * window, AKA gen, indexes an array of per-type and per-zone lists for the
+ * corresponding generation. The counter in page->flags stores gen+1 while a
+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
+ *
+ * After a page is faulted in, the aging must check the accessed bit at least
+ * twice before the eviction would consider it. The first check clears the
+ * accessed bit set during the initial fault. The second check makes sure this
+ * page hasn't been used since then.
+ */
+#define MIN_NR_GENS 2
+#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+
+struct lrugen {
+ /* the aging increments the max generation number */
+ unsigned long max_seq;
+ /* the eviction increments the min generation numbers */
+ unsigned long min_seq[ANON_AND_FILE];
+ /* the birth time of each generation in jiffies */
+ unsigned long timestamps[MAX_NR_GENS];
+ /* the multigenerational lru lists */
+ struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* the sizes of the multigenerational lru lists in pages */
+ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ /* whether the multigenerational lru is enabled */
+ bool enabled[ANON_AND_FILE];
+};
+
+#define MAX_BATCH_SIZE 8192
+
+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
+void lru_gen_change_state(bool enable, bool main, bool swap);
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
+#endif
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
+{
+}
+
+static inline void lru_gen_change_state(bool enable, bool main, bool swap)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+#endif
+
+#endif /* CONFIG_LRU_GEN */
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
@@ -311,6 +377,10 @@ struct lruvec {
unsigned long refaults[ANON_AND_FILE];
/* Various lruvec state flags (enum lruvec_flags) */
unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+ /* unevictable pages are on LRU_UNEVICTABLE */
+ struct lrugen evictable;
+#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
#endif
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -26,6 +26,14 @@
#define ZONES_WIDTH ZONES_SHIFT
+#ifdef CONFIG_LRU_GEN
+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
+#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2)
+#else
+#define LRU_GEN_WIDTH 0
+#define LRU_REFS_WIDTH 0
+#endif /* CONFIG_LRU_GEN */
+
#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
@@ -55,7 +63,8 @@
#define SECTIONS_WIDTH 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+ <= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +98,8 @@
#define LAST_CPUPID_SHIFT 0
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
- <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
#define LAST_CPUPID_WIDTH 0
@@ -100,8 +109,8 @@
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif
-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
- > BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+ KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error "Not enough bits in page flags"
#endif
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
1UL << PG_private | 1UL << PG_private_2 | \
1UL << PG_writeback | 1UL << PG_reserved | \
1UL << PG_slab | 1UL << PG_active | \
- 1UL << PG_unevictable | __PG_MLOCKED)
+ 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)
/*
* Flags checked when a page is prepped for return by the page allocator.
@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
* alloc-free cycle to prevent from reusing the page.
*/
#define PAGE_FLAGS_CHECK_AT_PREP \
- (PAGEFLAGS_MASK & ~__PG_HWPOISON)
+ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
#define PAGE_FLAGS_PRIVATE \
(1UL << PG_private | 1UL << PG_private_2)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -911,6 +911,9 @@ struct task_struct {
#ifdef CONFIG_MEMCG
unsigned in_user_fault:1;
#endif
+#ifdef CONFIG_LRU_GEN
+ unsigned in_nonseq_fault:1;
+#endif
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,9 @@ int main(void)
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+ DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
+#endif
/* End of constants */
return 0;
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc
#ifdef CONFIG_64BIT
(1L << PG_arch_2) |
#endif
- (1L << PG_dirty)));
+ (1L << PG_dirty) |
+ LRU_GEN_MASK | LRU_REFS_MASK));
/* ->mapping in first tail page is compound_mapcount */
VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all
memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+ lru_gen_init_memcg(memcg);
return memcg;
fail:
mem_cgroup_id_remove(memcg);
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are
unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
+ bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
__set_current_state(TASK_RUNNING);
@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are
if (flags & FAULT_FLAG_USER)
mem_cgroup_enter_user_fault();
+ if (nonseq_fault)
+ task_enter_nonseq_fault();
+
if (unlikely(is_vm_hugetlb_page(vma)))
ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
else
ret = __handle_mm_fault(vma, address, flags);
+ if (nonseq_fault)
+ task_exit_nonseq_fault();
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
shift = 8 * sizeof(unsigned long);
width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
- - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
KASAN_TAG_WIDTH,
+ LRU_GEN_WIDTH,
+ LRU_REFS_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7411,6 +7411,7 @@ static void __meminit pgdat_init_interna
pgdat_page_ext_init(pgdat);
lruvec_init(&pgdat->__lruvec);
+ lru_gen_init_state(NULL, &pgdat->__lruvec);
}
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
+ /* see the comment in lru_gen_add_page() */
+ if (lru_gen_enabled() && !PageUnevictable(page) &&
+ task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
+ SetPageActive(page);
+
get_page(page);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
- if (PageActive(page) && !PageUnevictable(page)) {
+ if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec);
@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p
*/
void deactivate_page(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
struct pagevec *pvec;
local_lock(&lru_pvecs.lock);
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
err = 0;
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
+ lru_gen_change_state(false, false, true);
out_dput:
filp_close(victim, NULL);
@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
+ lru_gen_change_state(true, false, true);
error = 0;
goto out;
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
+#include <linux/memory.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg
return can_demote(pgdat->node_id, sc);
}
+#ifdef CONFIG_LRU_GEN
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define for_each_gen_type_zone(gen, type, zone) \
+ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static int page_lru_gen(struct page *page)
+{
+ unsigned long flags = READ_ONCE(page->flags);
+
+ return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
+{
+ struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+ if (memcg) {
+ struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+ if (lruvec->pgdat != pgdat)
+ lruvec->pgdat = pgdat;
+
+ return lruvec;
+ }
+#endif
+ return pgdat ? &pgdat->__lruvec : NULL;
+}
+
+static int get_nr_gens(struct lruvec *lruvec, int type)
+{
+ return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
+}
+
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+{
+ return get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
+ get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) &&
+ get_nr_gens(lruvec, 0) <= MAX_NR_GENS;
+}
+
+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
+#else
+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
+#endif
+
+static int lru_gen_nr_swapfiles;
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ enum lru_list lru;
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ for_each_evictable_lru(lru) {
+ type = is_file_lru(lru);
+
+ if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
+ return false;
+
+ /* unlikely but not a bug when reset_batch_size() is pending */
+ VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
+ }
+
+ return true;
+}
+
+static bool fill_lists(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int remaining = MAX_BATCH_SIZE;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ if (!lruvec->evictable.enabled[type])
+ continue;
+
+ while (!list_empty(head)) {
+ bool success;
+ struct page *page = lru_to_page(head);
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageActive(page) != active, page);
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+ VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page);
+
+ prefetchw_prev_lru_page(page, head, flags);
+
+ del_page_from_lru_list(page, lruvec);
+ success = lru_gen_add_page(page, lruvec, false);
+ VM_BUG_ON(!success);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool drain_lists(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_BATCH_SIZE;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
+
+ if (lruvec->evictable.enabled[type])
+ continue;
+
+ while (!list_empty(head)) {
+ bool success;
+ struct page *page = lru_to_page(head);
+
+ VM_BUG_ON_PAGE(PageTail(page), page);
+ VM_BUG_ON_PAGE(PageUnevictable(page), page);
+ VM_BUG_ON_PAGE(PageActive(page), page);
+ VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+ VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+ prefetchw_prev_lru_page(page, head, flags);
+
+ success = lru_gen_del_page(page, lruvec, false);
+ VM_BUG_ON(!success);
+ add_page_to_lru_list(page, lruvec);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * For file page tracking, we enable/disable it according to the main switch.
+ * For anon page tracking, we only enabled it when the main switch is on and
+ * there is at least one swapfile; we disable it when there are no swapfiles
+ * regardless of the value of the main switch. Otherwise, we will eventually
+ * reach the max size of the sliding window and have to call inc_min_seq().
+ */
+void lru_gen_change_state(bool enable, bool main, bool swap)
+{
+ static DEFINE_MUTEX(state_mutex);
+
+ struct mem_cgroup *memcg;
+
+ mem_hotplug_begin();
+ cgroup_lock();
+ mutex_lock(&state_mutex);
+
+ if (swap) {
+ if (enable)
+ swap = !lru_gen_nr_swapfiles++;
+ else
+ swap = !--lru_gen_nr_swapfiles;
+ }
+
+ if (main && enable != lru_gen_enabled()) {
+ if (enable)
+ static_branch_enable(&lru_gen_static_key);
+ else
+ static_branch_disable(&lru_gen_static_key);
+ } else if (!swap || !lru_gen_enabled())
+ goto unlock;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+ if (!lruvec)
+ continue;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_BUG_ON(!seq_is_valid(lruvec));
+ VM_BUG_ON(!state_is_valid(lruvec));
+
+ lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+ lruvec->evictable.enabled[1] = lru_gen_enabled();
+
+ while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ mutex_unlock(&state_mutex);
+ cgroup_unlock();
+ mem_hotplug_done();
+}
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
+{
+ int i;
+ int gen, type, zone;
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+ lrugen->enabled[1] = lru_gen_enabled();
+
+ for (i = 0; i <= MIN_NR_GENS + 1; i++)
+ lrugen->timestamps[i] = jiffies;
+
+ for_each_gen_type_zone(gen, type, zone)
+ INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+ lru_gen_init_state(memcg, lruvec);
+ }
+}
+#endif
+
+static int __init init_lru_gen(void)
+{
+ BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+ BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+ return 0;
+};
+late_initcall(init_lru_gen);
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];

View File

@ -0,0 +1,760 @@
From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 5 Apr 2021 04:17:41 -0600
Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list
To scan PTEs for accessed pages, a mm_struct list is maintained for
each memcg. When multiple threads traverse the same memcg->mm_list,
each of them gets a unique mm_struct and therefore they can run
walk_page_range() concurrently to reach page tables of all processes
of this memcg.
This infrastructure also provides the following optimizations:
1) it allows walkers to skip processes that have been sleeping since
the last walk by tracking the usage of mm_struct between context
switches.
2) it allows walkers to add interesting items they find during a
walk to a Bloom filter so that they can skip uninteresting items
during the next walk by testing whether an item is in this Bloom
filter.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8
---
fs/exec.c | 2 +
include/linux/memcontrol.h | 4 +
include/linux/mm_inline.h | 6 +
include/linux/mm_types.h | 75 +++++++++
include/linux/mmzone.h | 63 +++++++
kernel/exit.c | 1 +
kernel/fork.c | 9 +
kernel/sched/core.c | 1 +
mm/memcontrol.c | 25 +++
mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++
10 files changed, 517 insertions(+)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
+ lru_gen_add_mm(mm);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
activate_mm(active_mm, mm);
+ lru_gen_activate_mm(mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -348,6 +348,10 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif
+#ifdef CONFIG_LRU_GEN
+ struct lru_gen_mm_list mm_list;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[];
};
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig
return seq % MAX_NR_GENS;
}
+/* Return a proper index regardless whether we keep stats for historical generations. */
+static inline int lru_hist_from_seq(unsigned long seq)
+{
+ return seq % NR_HIST_GENS;
+}
+
/* The youngest and the second youngest generations are counted as active. */
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -3,6 +3,7 @@
#define _LINUX_MM_TYPES_H
#include <linux/mm_types_task.h>
+#include <linux/sched.h>
#include <linux/auxvec.h>
#include <linux/list.h>
@@ -15,6 +16,8 @@
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
+#include <linux/nodemask.h>
+#include <linux/mmdebug.h>
#include <asm/mmu.h>
@@ -580,6 +583,18 @@ struct mm_struct {
#ifdef CONFIG_IOMMU_SUPPORT
u32 pasid;
#endif
+#ifdef CONFIG_LRU_GEN
+ struct {
+ /* the node of a global or per-memcg mm_struct list */
+ struct list_head list;
+#ifdef CONFIG_MEMCG
+ /* points to the memcg of the owner task above */
+ struct mem_cgroup *memcg;
+#endif
+ /* whether this mm_struct has been used since the last walk */
+ nodemask_t nodes;
+ } lrugen;
+#endif /* CONFIG_LRU_GEN */
} __randomize_layout;
/*
@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru
return (struct cpumask *)&mm->cpu_bitmap;
}
+#ifdef CONFIG_LRU_GEN
+
+struct lru_gen_mm_list {
+ /* a global or per-memcg mm_struct list */
+ struct list_head fifo;
+ /* protects the list above */
+ spinlock_t lock;
+};
+
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+ INIT_LIST_HEAD(&mm->lrugen.list);
+#ifdef CONFIG_MEMCG
+ mm->lrugen.memcg = NULL;
+#endif
+ nodes_clear(mm->lrugen.nodes);
+}
+
+/* Track the usage of each mm_struct so that we can skip inactive ones. */
+static inline void lru_gen_activate_mm(struct mm_struct *mm)
+{
+ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */
+ VM_WARN_ON(list_empty(&mm->lrugen.list));
+
+ if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes))
+ nodes_setall(mm->lrugen.nodes);
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_add_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_del_mm(struct mm_struct *mm)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_activate_mm(struct mm_struct *mm)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -318,6 +318,13 @@ struct lruvec;
#define MIN_NR_GENS 2
#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+/* Whether to keep stats for historical generations. */
+#ifdef CONFIG_LRU_GEN_STATS
+#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+#else
+#define NR_HIST_GENS 1U
+#endif
+
struct lrugen {
/* the aging increments the max generation number */
unsigned long max_seq;
@@ -333,13 +340,63 @@ struct lrugen {
bool enabled[ANON_AND_FILE];
};
+enum {
+ MM_LEAF_TOTAL, /* total leaf entries */
+ MM_LEAF_OLD, /* old leaf entries */
+ MM_LEAF_YOUNG, /* young leaf entries */
+ MM_NONLEAF_TOTAL, /* total non-leaf entries */
+ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */
+ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */
+ NR_MM_STATS
+};
+
+/* mnemonic codes for the stats above */
+#define MM_STAT_CODES "toydpc"
+
+/* double buffering bloom filters */
+#define NR_BLOOM_FILTERS 2
+
+struct lru_gen_mm_walk {
+ /* set to max_seq after each round of walk */
+ unsigned long seq;
+ /* the next mm_struct on the list to walk */
+ struct list_head *head;
+ /* the first mm_struct never walked before */
+ struct list_head *tail;
+ /* to wait for the last walker to finish */
+ struct wait_queue_head wait;
+ /* bloom filters flip after each round of walk */
+ unsigned long *filters[NR_BLOOM_FILTERS];
+ /* page table stats for debugging */
+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
+ /* the number of concurrent walkers */
+ int nr_walkers;
+};
+
+#define MIN_BATCH_SIZE 64
#define MAX_BATCH_SIZE 8192
+struct mm_walk_args {
+ struct mem_cgroup *memcg;
+ unsigned long max_seq;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+ unsigned long next_addr;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)];
+ int node_id;
+ int swappiness;
+ int batch_size;
+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ int mm_stats[NR_MM_STATS];
+ bool use_filter;
+};
+
void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
void lru_gen_change_state(bool enable, bool main, bool swap);
#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
+void lru_gen_free_memcg(struct mem_cgroup *memcg);
#endif
#else /* !CONFIG_LRU_GEN */
@@ -356,6 +413,10 @@ static inline void lru_gen_change_state(
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
+
+static inline void lru_gen_free_memcg(struct mem_cgroup *memcg)
+{
+}
#endif
#endif /* CONFIG_LRU_GEN */
@@ -380,6 +441,8 @@ struct lruvec {
#ifdef CONFIG_LRU_GEN
/* unevictable pages are on LRU_UNEVICTABLE */
struct lrugen evictable;
+ /* state for mm list and page table walks */
+ struct lru_gen_mm_walk mm_walk;
#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -422,6 +422,7 @@ assign_new_owner:
goto retry;
}
WRITE_ONCE(mm->owner, c);
+ lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1080,6 +1080,7 @@ static struct mm_struct *mm_init(struct
goto fail_nocontext;
mm->user_ns = get_user_ns(user_ns);
+ lru_gen_init_mm(mm);
return mm;
fail_nocontext:
@@ -1122,6 +1123,7 @@ static inline void __mmput(struct mm_str
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ lru_gen_del_mm(mm);
mmdrop(mm);
}
@@ -2616,6 +2618,13 @@ pid_t kernel_clone(struct kernel_clone_a
get_task_struct(p);
}
+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ /* lock the task to synchronize with memcg migration */
+ task_lock(p);
+ lru_gen_add_mm(p->mm);
+ task_unlock(p);
+ }
+
wake_up_new_task(p);
/* forking complete and child started to run, tell ptracer */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4978,6 +4978,7 @@ context_switch(struct rq *rq, struct tas
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_activate_mm(next->mm);
if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5163,6 +5163,7 @@ static void __mem_cgroup_free(struct mem
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ lru_gen_free_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
}
@@ -6195,6 +6196,29 @@ static void mem_cgroup_move_task(void)
}
#endif
+#ifdef CONFIG_LRU_GEN
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *task = NULL;
+
+ cgroup_taskset_for_each_leader(task, css, tset)
+ break;
+
+ if (!task)
+ return;
+
+ task_lock(task);
+ if (task->mm && task->mm->owner == task)
+ lru_gen_migrate_mm(task->mm);
+ task_unlock(task);
+}
+#else
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
if (value == PAGE_COUNTER_MAX)
@@ -6538,6 +6562,7 @@ struct cgroup_subsys memory_cgrp_subsys
.css_reset = mem_cgroup_css_reset,
.css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
+ .attach = mem_cgroup_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
.dfl_cftypes = memory_files,
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2929,6 +2929,306 @@ static bool __maybe_unused seq_is_valid(
}
/******************************************************************************
+ * mm_struct list
+ ******************************************************************************/
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+ static struct lru_gen_mm_list mm_list = {
+ .fifo = LIST_HEAD_INIT(mm_list.fifo),
+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
+ };
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ return &memcg->mm_list;
+#endif
+ return &mm_list;
+}
+
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+ int nid;
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
+#ifdef CONFIG_MEMCG
+ VM_BUG_ON_MM(mm->lrugen.memcg, mm);
+ mm->lrugen.memcg = memcg;
+#endif
+ spin_lock(&mm_list->lock);
+
+ list_add_tail(&mm->lrugen.list, &mm_list->fifo);
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+ if (!lruvec)
+ continue;
+
+ if (lruvec->mm_walk.tail == &mm_list->fifo)
+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev;
+ }
+
+ spin_unlock(&mm_list->lock);
+}
+
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+ int nid;
+ struct lru_gen_mm_list *mm_list;
+ struct mem_cgroup *memcg = NULL;
+
+ if (list_empty(&mm->lrugen.list))
+ return;
+
+#ifdef CONFIG_MEMCG
+ memcg = mm->lrugen.memcg;
+#endif
+ mm_list = get_mm_list(memcg);
+
+ spin_lock(&mm_list->lock);
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+ if (!lruvec)
+ continue;
+
+ if (lruvec->mm_walk.tail == &mm->lrugen.list)
+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next;
+
+ if (lruvec->mm_walk.head != &mm->lrugen.list)
+ continue;
+
+ lruvec->mm_walk.head = lruvec->mm_walk.head->next;
+ if (lruvec->mm_walk.head == &mm_list->fifo)
+ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1);
+ }
+
+ list_del_init(&mm->lrugen.list);
+
+ spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+ mem_cgroup_put(mm->lrugen.memcg);
+ mm->lrugen.memcg = NULL;
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+
+ lockdep_assert_held(&mm->owner->alloc_lock);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(mm->owner);
+ rcu_read_unlock();
+ if (memcg == mm->lrugen.memcg)
+ return;
+
+ VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
+ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
+
+ lru_gen_del_mm(mm);
+ lru_gen_add_mm(mm);
+}
+#endif
+
+#define BLOOM_FILTER_SHIFT 15
+
+static inline int filter_gen_from_seq(unsigned long seq)
+{
+ return seq % NR_BLOOM_FILTERS;
+}
+
+static void get_item_key(void *item, int *key)
+{
+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
+
+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
+
+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
+ key[1] = hash >> BLOOM_FILTER_SHIFT;
+}
+
+static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+{
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+ filter = lruvec->mm_walk.filters[gen];
+ if (filter) {
+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
+ return;
+ }
+
+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC);
+ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter);
+}
+
+static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
+ if (!filter)
+ return;
+
+ get_item_key(item, key);
+
+ if (!test_bit(key[0], filter))
+ set_bit(key[0], filter);
+ if (!test_bit(key[1], filter))
+ set_bit(key[1], filter);
+}
+
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
+ if (!filter)
+ return false;
+
+ get_item_key(item, key);
+
+ return test_bit(key[0], filter) && test_bit(key[1], filter);
+}
+
+static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args)
+{
+ int i;
+ int hist = lru_hist_from_seq(args->max_seq);
+
+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+ for (i = 0; i < NR_MM_STATS; i++) {
+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i],
+ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]);
+ args->mm_stats[i] = 0;
+ }
+
+ if (!last || NR_HIST_GENS == 1)
+ return;
+
+ hist = lru_hist_from_seq(args->max_seq + 1);
+ for (i = 0; i < NR_MM_STATS; i++)
+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0);
+}
+
+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
+{
+ int type;
+ unsigned long size = 0;
+
+ if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes))
+ return true;
+
+ if (mm_is_oom_victim(mm))
+ return true;
+
+ for (type = !args->swappiness; type < ANON_AND_FILE; type++) {
+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
+ get_mm_counter(mm, MM_ANONPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
+ }
+
+ if (size < MIN_BATCH_SIZE)
+ return true;
+
+ if (!mmget_not_zero(mm))
+ return true;
+
+ node_clear(args->node_id, mm->lrugen.nodes);
+
+ return false;
+}
+
+/* To support multiple walkers that concurrently walk an mm_struct list. */
+static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args,
+ struct mm_struct **iter)
+{
+ bool first = false;
+ bool last = true;
+ struct mm_struct *mm = NULL;
+ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk;
+ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
+
+ if (*iter)
+ mmput_async(*iter);
+ else if (args->max_seq <= READ_ONCE(mm_walk->seq))
+ return false;
+
+ spin_lock(&mm_list->lock);
+
+ VM_BUG_ON(args->max_seq > mm_walk->seq + 1);
+ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq);
+ VM_BUG_ON(*iter && !mm_walk->nr_walkers);
+
+ if (args->max_seq <= mm_walk->seq) {
+ if (!*iter)
+ last = false;
+ goto done;
+ }
+
+ if (mm_walk->head == &mm_list->fifo) {
+ VM_BUG_ON(mm_walk->nr_walkers);
+ mm_walk->head = mm_walk->head->next;
+ first = true;
+ }
+
+ while (!mm && mm_walk->head != &mm_list->fifo) {
+ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list);
+
+ mm_walk->head = mm_walk->head->next;
+
+ if (mm_walk->tail == &mm->lrugen.list) {
+ mm_walk->tail = mm_walk->tail->next;
+ args->use_filter = false;
+ }
+
+ if (should_skip_mm(mm, args))
+ mm = NULL;
+ }
+
+ if (mm_walk->head == &mm_list->fifo)
+ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1);
+done:
+ if (*iter && !mm)
+ mm_walk->nr_walkers--;
+ if (!*iter && mm)
+ mm_walk->nr_walkers++;
+
+ if (mm_walk->nr_walkers)
+ last = false;
+
+ if (mm && first)
+ clear_bloom_filter(lruvec, args->max_seq + 1);
+
+ if (*iter || last)
+ reset_mm_stats(lruvec, last, args);
+
+ spin_unlock(&mm_list->lock);
+
+ *iter = mm;
+
+ return last;
+}
+
+/******************************************************************************
* state change
******************************************************************************/
@@ -3112,6 +3412,7 @@ void lru_gen_init_state(struct mem_cgrou
int i;
int gen, type, zone;
struct lrugen *lrugen = &lruvec->evictable;
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
@@ -3122,6 +3423,17 @@ void lru_gen_init_state(struct mem_cgrou
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
+ spin_lock(&mm_list->lock);
+
+ lruvec->mm_walk.seq = MIN_NR_GENS;
+ lruvec->mm_walk.head = &mm_list->fifo;
+ lruvec->mm_walk.tail = &mm_list->fifo;
+ init_waitqueue_head(&lruvec->mm_walk.wait);
+
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
+ spin_unlock(&mm_list->lock);
}
#ifdef CONFIG_MEMCG
@@ -3129,18 +3441,37 @@ void lru_gen_init_memcg(struct mem_cgrou
{
int nid;
+ INIT_LIST_HEAD(&memcg->mm_list.fifo);
+ spin_lock_init(&memcg->mm_list.lock);
+
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(nid, memcg);
lru_gen_init_state(memcg, lruvec);
}
}
+
+void lru_gen_free_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ int i;
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+ for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+ bitmap_free(lruvec->mm_walk.filters[i]);
+ lruvec->mm_walk.filters[i] = NULL;
+ }
+ }
+}
#endif
static int __init init_lru_gen(void)
{
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
return 0;
};

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,496 @@
From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:38:02 -0700
Subject: [PATCH 08/10] mm: multigenerational lru: user interface
Add /sys/kernel/mm/lru_gen/enabled to enable and disable the
multigenerational lru at runtime.
Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a
given number of milliseconds. The OOM killer is invoked if this
working set cannot be kept in memory.
Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and
invoke the aging and the eviction. This file has the following output:
memcg memcg_id memcg_path
node node_id
min_gen birth_time anon_size file_size
...
max_gen birth_time anon_size file_size
min_gen is the oldest generation number and max_gen is the youngest
generation number. birth_time is in milliseconds. anon_size and
file_size are in pages.
This file takes the following input:
+ memcg_id node_id max_gen [swappiness] [use_bloom_filter]
- memcg_id node_id min_gen [swappiness] [nr_to_reclaim]
The first command line invokes the aging, which scans PTEs for
accessed pages and then creates the next generation max_gen+1. A swap
file and a non-zero swappiness, which overrides vm.swappiness, are
required to scan PTEs mapping anon pages. The second command line
invokes the eviction, which evicts generations less than or equal to
min_gen. min_gen should be less than max_gen-1 as max_gen and
max_gen-1 are not fully aged and therefore cannot be evicted.
Setting nr_to_reclaim to N limits the number of pages to evict.
Setting use_bloom_filter to 0 overrides the default behavior which
only scans PTE tables found populated. Multiple command lines are
supported, as is concatenation with delimiters "," and ";".
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3
---
include/linux/nodemask.h | 1 +
mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++
2 files changed, 416 insertions(+)
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -485,6 +485,7 @@ static inline int num_node_state(enum no
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,6 +53,8 @@
#include <linux/memory.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -4882,6 +4884,413 @@ unlock:
}
/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 10, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
+static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled());
+}
+
+static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ bool enable;
+
+ if (kstrtobool(buf, &enable))
+ return -EINVAL;
+
+ lru_gen_change_state(enable, true, false);
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_enable, store_enable
+);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
+/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return get_lruvec(nid, memcg);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(nid, memcg);
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = lru_hist_from_seq(seq);
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+
+ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]);
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+
+ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]);
+ } else
+ seq_puts(m, " 0 0 0 ");
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ if (seq == max_seq && NR_HIST_GENS == 1)
+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
+ toupper(MM_STAT_CODES[i]));
+ else if (seq != max_seq && NR_HIST_GENS > 1)
+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
+ MM_STAT_CODES[i]);
+ else
+ seq_puts(m, " 0 ");
+ }
+ seq_putc(m, '\n');
+}
+
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lrugen *lrugen = &lruvec->evictable;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ if (!full)
+ seq = min_seq[0];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int gen, type, zone;
+ unsigned int msecs;
+
+ gen = lru_gen_from_seq(seq);
+ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen]));
+
+ seq_printf(m, " %10lu %10u", seq, msecs);
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ long size = 0;
+
+ if (seq < min_seq[type]) {
+ seq_puts(m, " -0 ");
+ continue;
+ }
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += READ_ONCE(lrugen->sizes[gen][type][zone]);
+
+ seq_printf(m, " %10lu ", max(size, 0L));
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ unsigned long seq, bool use_filter)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq == max_seq)
+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter);
+
+ return seq > max_seq ? -EINVAL : 0;
+}
+
+static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ unsigned long seq, unsigned long nr_to_reclaim)
+{
+ struct blk_plug plug;
+ int err = -EINTR;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq >= max_seq - 1)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ blk_start_plug(&plug);
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim ||
+ !evict_pages(lruvec, sc, swappiness)) {
+ err = 0;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ blk_finish_plug(&plug);
+
+ return err;
+}
+
+static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc,
+ int swappiness, unsigned long seq, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ goto done;
+ }
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ goto done;
+
+ lruvec = get_lruvec(nid, memcg);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(memcg);
+ else if (swappiness > 200)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, sc, swappiness, seq, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, sc, swappiness, seq, opt);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ int err = 0;
+ struct scan_control sc = {
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args();
+ if (!sc.reclaim_state.mm_walk_args) {
+ kvfree(buf);
+ return -ENOMEM;
+ }
+
+ flags = memalloc_noreclaim_save();
+ set_task_reclaim_state(current, &sc.reclaim_state);
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt);
+ if (err)
+ break;
+ }
+
+ set_task_reclaim_state(current, NULL);
+ memalloc_noreclaim_restore(flags);
+
+ free_mm_walk_args(sc.reclaim_state.mm_walk_args);
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/******************************************************************************
* initialization
******************************************************************************/
@@ -4951,6 +5360,12 @@ static int __init init_lru_gen(void)
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
return 0;
};
late_initcall(init_lru_gen);

View File

@ -0,0 +1,80 @@
From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:47:24 -0700
Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig
Add configuration options for the multigenerational lru.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635
---
mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 59 insertions(+)
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -899,4 +899,63 @@ config SECRETMEM
source "mm/damon/Kconfig"
+# the multigenerational lru {
+config LRU_GEN
+ bool "Multigenerational LRU"
+ depends on MMU
+ # the following options may leave not enough spare bits in page->flags
+ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP)
+ help
+ A high performance LRU implementation to heavily overcommit workloads
+ that are not IO bound. See Documentation/vm/multigen_lru.rst for
+ details.
+
+ Warning: do not enable this option unless you plan to use it because
+ it introduces a small per-process and per-memcg and per-node memory
+ overhead.
+
+config LRU_GEN_ENABLED
+ bool "Turn on by default"
+ depends on LRU_GEN
+ help
+ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option
+ changes it to 1.
+
+ Warning: the default value is the fast path. See
+ Documentation/static-keys.txt for details.
+
+config LRU_GEN_STATS
+ bool "Full stats for debugging"
+ depends on LRU_GEN
+ help
+ This option keeps full stats for each generation, which can be read
+ from /sys/kernel/debug/lru_gen_full.
+
+ Warning: do not enable this option unless you plan to use it because
+ it introduces an additional small per-process and per-memcg and
+ per-node memory overhead.
+
+config NR_LRU_GENS
+ int "Max number of generations"
+ depends on LRU_GEN
+ range 4 31
+ default 7
+ help
+ This will use order_base_2(N+1) spare bits from page flags.
+
+ Warning: do not use numbers larger than necessary because each
+ generation introduces a small per-node and per-memcg memory overhead.
+
+config TIERS_PER_GEN
+ int "Number of tiers per generation"
+ depends on LRU_GEN
+ range 2 5
+ default 4
+ help
+ This will use N-2 spare bits from page flags.
+
+ Larger values generally offer better protection to active pages under
+ heavy buffered I/O workloads.
+# }
+
endmenu

View File

@ -0,0 +1,161 @@
From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Tue, 2 Feb 2021 01:27:45 -0700
Subject: [PATCH 10/10] mm: multigenerational lru: documentation
Add Documentation/vm/multigen_lru.rst.
Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
---
Documentation/vm/index.rst | 1 +
Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++
2 files changed, 133 insertions(+)
create mode 100644 Documentation/vm/multigen_lru.rst
--- a/Documentation/vm/index.rst
+++ b/Documentation/vm/index.rst
@@ -17,6 +17,7 @@ various features of the Linux memory man
swap_numa
zswap
+ multigen_lru
Kernel developers MM documentation
==================================
--- /dev/null
+++ b/Documentation/vm/multigen_lru.rst
@@ -0,0 +1,132 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+Multigenerational LRU
+=====================
+
+Quick Start
+===========
+Build Configurations
+--------------------
+:Required: Set ``CONFIG_LRU_GEN=y``.
+
+:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by
+ default.
+
+Runtime Configurations
+----------------------
+:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enable`` if the
+ feature was not turned on by default.
+
+:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to
+ protect the working set of ``N`` milliseconds. The OOM killer is
+ invoked if this working set cannot be kept in memory.
+
+:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature
+ is turned on. This file has the following output:
+
+::
+
+ memcg memcg_id memcg_path
+ node node_id
+ min_gen birth_time anon_size file_size
+ ...
+ max_gen birth_time anon_size file_size
+
+``min_gen`` is the oldest generation number and ``max_gen`` is the
+youngest generation number. ``birth_time`` is in milliseconds.
+``anon_size`` and ``file_size`` are in pages.
+
+Phones/Laptops/Workstations
+---------------------------
+No additional configurations required.
+
+Servers/Data Centers
+--------------------
+:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a
+ larger number.
+
+:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger
+ number.
+
+:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``.
+
+:Working set estimation: Write ``+ memcg_id node_id max_gen
+ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to
+ invoke the aging, which scans PTEs for accessed pages and then
+ creates the next generation ``max_gen+1``. A swap file and a non-zero
+ ``swappiness``, which overrides ``vm.swappiness``, are required to
+ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to
+ override the default behavior which only scans PTE tables found
+ populated.
+
+:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness]
+ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the
+ eviction, which evicts generations less than or equal to ``min_gen``.
+ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and
+ ``max_gen-1`` are not fully aged and therefore cannot be evicted.
+ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple
+ command lines are supported, so does concatenation with delimiters
+ ``,`` and ``;``.
+
+Framework
+=========
+For each ``lruvec``, evictable pages are divided into multiple
+generations. The youngest generation number is stored in
+``lrugen->max_seq`` for both anon and file types as they are aged on
+an equal footing. The oldest generation numbers are stored in
+``lrugen->min_seq[]`` separately for anon and file types as clean
+file pages can be evicted regardless of swap and writeback
+constraints. These three variables are monotonically increasing.
+Generation numbers are truncated into
+``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into
+``page->flags``. The sliding window technique is used to prevent
+truncated generation numbers from overlapping. Each truncated
+generation number is an index to an array of per-type and per-zone
+lists ``lrugen->lists``.
+
+Each generation is divided into multiple tiers. Tiers represent
+different ranges of numbers of accesses from file descriptors only.
+Pages accessed ``N`` times via file descriptors belong to tier
+``order_base_2(N)``. Each generation contains at most
+``CONFIG_TIERS_PER_GEN`` tiers, and they require additional
+``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to
+moving between generations which requires list operations, moving
+between tiers only involves operations on ``page->flags`` and
+therefore has a negligible cost. A feedback loop modeled after the PID
+controller monitors refaulted % across all tiers and decides when to
+protect pages from which tiers.
+
+The framework comprises two conceptually independent components: the
+aging and the eviction, which can be invoked separately from user
+space for the purpose of working set estimation and proactive reclaim.
+
+Aging
+-----
+The aging produces young generations. Given an ``lruvec``, the aging
+traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()``
+to scan PTEs for accessed pages (a ``mm_struct`` list is maintained
+for each ``memcg``). Upon finding one, the aging updates its
+generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``).
+After each round of traversal, the aging increments ``max_seq``. The
+aging is due when ``min_seq[]`` reaches ``max_seq-1``.
+
+Eviction
+--------
+The eviction consumes old generations. Given an ``lruvec``, the
+eviction scans pages on the per-zone lists indexed by anon and file
+``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to
+select a type based on the values of ``min_seq[]``. If they are
+equal, it selects the type that has a lower refaulted %. The eviction
+sorts a page according to its updated generation number if the aging
+has found this page accessed. It also moves a page to the next
+generation if this page is from an upper tier that has a higher
+refaulted % than the base tier. The eviction increments ``min_seq[]``
+of a selected type when it finds all the per-zone lists indexed by
+``min_seq[]`` of this selected type are empty.
+
+To-do List
+==========
+KVM Optimization
+----------------
+Support shadow page table walk.