mirror of https://github.com/openwrt/openwrt.git
kernel: add and enable MGLRU for Linux 5.15
Backport a preliminary version of Yu Zhao's multi-generational LRU, for improved memory management. Refresh the patches while at it.

Signed-off-by: Rui Salvaterra <rsalvaterra@gmail.com>
parent 0be1b78856
commit 05158082f6
@@ -3195,6 +3195,9 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=12
 # CONFIG_LPC_ICH is not set
 # CONFIG_LPC_SCH is not set
 # CONFIG_LP_CONSOLE is not set
+CONFIG_LRU_GEN=y
+CONFIG_LRU_GEN_ENABLED=y
+# CONFIG_LRU_GEN_STATS is not set
 # CONFIG_LSI_ET1011C_PHY is not set
 CONFIG_LSM="lockdown,yama,loadpin,safesetid,integrity"
 CONFIG_LSM_MMAP_MIN_ADDR=65536
@@ -4388,6 +4391,7 @@ CONFIG_NMI_LOG_BUF_SHIFT=13
 # CONFIG_NO_HZ is not set
 # CONFIG_NO_HZ_FULL is not set
 # CONFIG_NO_HZ_IDLE is not set
+CONFIG_NR_LRU_GENS=7
 # CONFIG_NS83820 is not set
 # CONFIG_NTB is not set
 # CONFIG_NTFS3_64BIT_CLUSTER is not set
@@ -6480,6 +6484,7 @@ CONFIG_THIN_ARCHIVES=y
 # CONFIG_THUNDER_NIC_VF is not set
 # CONFIG_TICK_CPU_ACCOUNTING is not set
 CONFIG_TICK_ONESHOT=y
+CONFIG_TIERS_PER_GEN=4
 # CONFIG_TIFM_CORE is not set
 # CONFIG_TIGON3 is not set
 # CONFIG_TIMB_DMA is not set
@@ -0,0 +1,169 @@
From a8e6015d9534f39abc08e6804566af059e498a60 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Wed, 4 Aug 2021 01:31:34 -0600
Subject: [PATCH 01/10] mm: x86, arm64: add arch_has_hw_pte_young()

Some architectures automatically set the accessed bit in PTEs, e.g.,
x86 and arm64 v8.2. On architectures that do not have this capability,
clearing the accessed bit in a PTE triggers a page fault following the
TLB miss of this PTE.

Being aware of this capability can help make better decisions, i.e.,
whether to limit the size of each batch of PTEs and the burst of
batches when clearing the accessed bit.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Change-Id: Ib49b44fb56df3333a2ff1fcc496fb1980b976e7a
---
 arch/arm64/include/asm/cpufeature.h | 5 +++++
 arch/arm64/include/asm/pgtable.h | 13 ++++++++-----
 arch/arm64/kernel/cpufeature.c | 10 ++++++++++
 arch/arm64/tools/cpucaps | 1 +
 arch/x86/include/asm/pgtable.h | 6 +++---
 include/linux/pgtable.h | 13 +++++++++++++
 mm/memory.c | 14 +-------------
 7 files changed, 41 insertions(+), 21 deletions(-)

--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -808,6 +808,11 @@ static inline bool system_supports_tlb_r
	cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
}

+static inline bool system_has_hw_af(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && cpus_have_const_cap(ARM64_HW_AF);
+}
+
extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);

static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -999,13 +999,16 @@ static inline void update_mmu_cache(stru
 * page after fork() + CoW for pfn mappings. We don't always have a
 * hardware-managed access flag on arm64.
 */
-static inline bool arch_faults_on_old_pte(void)
+static inline bool arch_has_hw_pte_young(bool local)
{
-	WARN_ON(preemptible());
+	if (local) {
+		WARN_ON(preemptible());
+		return cpu_has_hw_af();
+	}

-	return !cpu_has_hw_af();
+	return system_has_hw_af();
}
-#define arch_faults_on_old_pte arch_faults_on_old_pte
+#define arch_has_hw_pte_young arch_has_hw_pte_young

/*
 * Experimentally, it's cheap to set the access flag in hardware and we
@@ -1013,7 +1016,7 @@ static inline bool arch_faults_on_old_pt
 */
static inline bool arch_wants_old_prefaulted_pte(void)
{
-	return !arch_faults_on_old_pte();
+	return arch_has_hw_pte_young(true);
}
#define arch_wants_old_prefaulted_pte arch_wants_old_prefaulted_pte

--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2184,6 +2184,16 @@ static const struct arm64_cpu_capabiliti
		.matches = has_hw_dbm,
		.cpu_enable = cpu_enable_hw_dbm,
	},
+	{
+		.desc = "Hardware update of the Access flag",
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.capability = ARM64_HW_AF,
+		.sys_reg = SYS_ID_AA64MMFR1_EL1,
+		.sign = FTR_UNSIGNED,
+		.field_pos = ID_AA64MMFR1_HADBS_SHIFT,
+		.min_field_value = 1,
+		.matches = has_cpuid_feature,
+	},
#endif
	{
		.desc = "CRC32 instructions",
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -35,6 +35,7 @@ HAS_STAGE2_FWB
HAS_SYSREG_GIC_CPUIF
HAS_TLB_RANGE
HAS_VIRT_HOST_EXTN
+HW_AF
HW_DBM
KVM_PROTECTED_MODE
MISMATCHED_CACHE_TYPE
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1397,10 +1397,10 @@ static inline bool arch_has_pfn_modify_c
	return boot_cpu_has_bug(X86_BUG_L1TF);
}

-#define arch_faults_on_old_pte arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
+#define arch_has_hw_pte_young arch_has_hw_pte_young
+static inline bool arch_has_hw_pte_young(bool local)
{
-	return false;
+	return true;
}

#endif /* __ASSEMBLY__ */
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -259,6 +259,19 @@ static inline int pmdp_clear_flush_young
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif

+#ifndef arch_has_hw_pte_young
+/*
+ * Return whether the accessed bit is supported by the local CPU or all CPUs.
+ *
+ * Those arches which have hw access flag feature need to implement their own
+ * helper. By default, "false" means pagefault will be hit on old pte.
+ */
+static inline bool arch_has_hw_pte_young(bool local)
+{
+	return false;
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
				       unsigned long address,
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -121,18 +121,6 @@ int randomize_va_space __read_mostly =
					2;
#endif

-#ifndef arch_faults_on_old_pte
-static inline bool arch_faults_on_old_pte(void)
-{
-	/*
-	 * Those arches which don't have hw access flag feature need to
-	 * implement their own helper. By default, "true" means pagefault
-	 * will be hit on old pte.
-	 */
-	return true;
-}
-#endif
-
#ifndef arch_wants_old_prefaulted_pte
static inline bool arch_wants_old_prefaulted_pte(void)
{
@@ -2782,7 +2770,7 @@ static inline bool cow_user_page(struct
	 * On architectures with software "accessed" bits, we would
	 * take a double page fault, so mark it accessed here.
	 */
-	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+	if (!arch_has_hw_pte_young(true) && !pte_young(vmf->orig_pte)) {
		pte_t entry;

		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
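The batching trade-off described in the commit message above can be pictured with a short sketch. This is illustrative only and not part of the patch: pick_pte_batch_size() and both constants are hypothetical, while arch_has_hw_pte_young() is the helper introduced by the change itself.

/* Illustrative sketch -- not part of the patch. */
#define PTE_BATCH_SMALL	64	/* hypothetical: each cleared bit may cost a page fault */
#define PTE_BATCH_LARGE	4096	/* hypothetical: hardware re-sets the bit for free */

static unsigned long pick_pte_batch_size(void)
{
	/* "false" asks whether all CPUs, not just the local one, set the bit */
	return arch_has_hw_pte_young(false) ? PTE_BATCH_LARGE : PTE_BATCH_SMALL;
}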
@@ -0,0 +1,111 @@
From f8b663bbfa30af5515e222fd74df20ea4e8393a2 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sat, 26 Sep 2020 21:17:18 -0600
Subject: [PATCH 02/10] mm: x86: add CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG

Some architectures support the accessed bit on non-leaf PMD entries,
e.g., x86_64 sets the accessed bit on a non-leaf PMD entry when using
it as part of linear address translation [1]. As an optimization, page
table walkers who are interested in the accessed bit can skip the PTEs
under a non-leaf PMD entry if the accessed bit is cleared on this PMD
entry.

Although an inline function may be preferable, this capability is
added as a configuration option to look consistent when used with the
existing macros.

[1]: Intel 64 and IA-32 Architectures Software Developer's Manual
     Volume 3 (June 2021), section 4.8

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I1a17be3ae926f721f7b17ea1539e5c39e8c4f9a8
---
 arch/Kconfig | 9 +++++++++
 arch/x86/Kconfig | 1 +
 arch/x86/include/asm/pgtable.h | 3 ++-
 arch/x86/mm/pgtable.c | 5 ++++-
 include/linux/pgtable.h | 4 ++--
 5 files changed, 18 insertions(+), 4 deletions(-)

--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1295,6 +1295,15 @@ config ARCH_HAS_ELFCORE_COMPAT
config ARCH_HAS_PARANOID_L1D_FLUSH
	bool

+config ARCH_HAS_NONLEAF_PMD_YOUNG
+	bool
+	depends on PGTABLE_LEVELS > 2
+	help
+	  Architectures that select this are able to set the accessed bit on
+	  non-leaf PMD entries in addition to leaf PTE entries where pages are
+	  mapped. For them, page table walkers that clear the accessed bit may
+	  stop at non-leaf PMD entries if they do not see the accessed bit.
+
source "kernel/gcov/Kconfig"

source "scripts/gcc-plugins/Kconfig"
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -84,6 +84,7 @@ config X86
	select ARCH_HAS_PMEM_API if X86_64
	select ARCH_HAS_PTE_DEVMAP if X86_64
	select ARCH_HAS_PTE_SPECIAL
+	select ARCH_HAS_NONLEAF_PMD_YOUNG if X86_64
	select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64
	select ARCH_HAS_COPY_MC if X86_64
	select ARCH_HAS_SET_MEMORY
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -817,7 +817,8 @@ static inline unsigned long pmd_page_vad

static inline int pmd_bad(pmd_t pmd)
{
-	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	return (pmd_flags(pmd) & ~(_PAGE_USER | _PAGE_ACCESSED)) !=
+	       (_KERNPG_TABLE & ~_PAGE_ACCESSED);
}

static inline unsigned long pages_to_mb(unsigned long npg)
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -550,7 +550,7 @@ int ptep_test_and_clear_young(struct vm_
	return ret;
}

-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
int pmdp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pmd_t *pmdp)
{
@@ -562,6 +562,9 @@ int pmdp_test_and_clear_young(struct vm_

	return ret;
}
+#endif
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pudp_test_and_clear_young(struct vm_area_struct *vma,
			      unsigned long addr, pud_t *pudp)
{
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -212,7 +212,7 @@ static inline int ptep_test_and_clear_yo
#endif

#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
					    unsigned long address,
					    pmd_t *pmdp)
@@ -233,7 +233,7 @@ static inline int pmdp_test_and_clear_yo
	BUILD_BUG();
	return 0;
}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG */
#endif

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
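The optimization this patch enables can be sketched as follows. Illustrative only, not from the patch: walk_pmd_sketch() is a hypothetical caller, while pmdp_test_and_clear_young() and CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG are the pieces touched above.

/* Illustrative sketch -- walk_pmd_sketch() is hypothetical. */
static void walk_pmd_sketch(struct vm_area_struct *vma, unsigned long addr,
			    pmd_t *pmd)
{
	/*
	 * If test-and-clear finds the non-leaf accessed bit already clear,
	 * no translation went through this PMD since the last walk, so
	 * every PTE under it can be skipped.
	 */
	if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
	    !pmdp_test_and_clear_young(vma, addr, pmd))
		return;

	/* ... otherwise descend and scan the PTEs under this PMD ... */
}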
@@ -0,0 +1,224 @@
From a810f8e2f1bdd0707eaf05c8b4ba84a3ff2801bd Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Sun, 27 Sep 2020 20:49:08 -0600
Subject: [PATCH 03/10] mm/vmscan.c: refactor shrink_node()

This patch refactors shrink_node(). This will make the upcoming
changes to mm/vmscan.c more readable.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: Iae734b5b4030205b7db6e8c841f747b6f6ae1a04
---
 mm/vmscan.c | 186 +++++++++++++++++++++++++++-------------------------
 1 file changed, 98 insertions(+), 88 deletions(-)

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2562,6 +2562,103 @@ enum scan_balance {
	SCAN_FILE,
};

+static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
+{
+	unsigned long file;
+	struct lruvec *target_lruvec;
+
+	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
+
+	/*
+	 * Determine the scan balance between anon and file LRUs.
+	 */
+	spin_lock_irq(&target_lruvec->lru_lock);
+	sc->anon_cost = target_lruvec->anon_cost;
+	sc->file_cost = target_lruvec->file_cost;
+	spin_unlock_irq(&target_lruvec->lru_lock);
+
+	/*
+	 * Target desirable inactive:active list ratios for the anon
+	 * and file LRU lists.
+	 */
+	if (!sc->force_deactivate) {
+		unsigned long refaults;
+
+		refaults = lruvec_page_state(target_lruvec,
+				WORKINGSET_ACTIVATE_ANON);
+		if (refaults != target_lruvec->refaults[0] ||
+			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+			sc->may_deactivate |= DEACTIVATE_ANON;
+		else
+			sc->may_deactivate &= ~DEACTIVATE_ANON;
+
+		/*
+		 * When refaults are being observed, it means a new
+		 * workingset is being established. Deactivate to get
+		 * rid of any stale active pages quickly.
+		 */
+		refaults = lruvec_page_state(target_lruvec,
+				WORKINGSET_ACTIVATE_FILE);
+		if (refaults != target_lruvec->refaults[1] ||
+			inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+			sc->may_deactivate |= DEACTIVATE_FILE;
+		else
+			sc->may_deactivate &= ~DEACTIVATE_FILE;
+	} else
+		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
+
+	/*
+	 * If we have plenty of inactive file pages that aren't
+	 * thrashing, try to reclaim those first before touching
+	 * anonymous pages.
+	 */
+	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+		sc->cache_trim_mode = 1;
+	else
+		sc->cache_trim_mode = 0;
+
+	/*
+	 * Prevent the reclaimer from falling into the cache trap: as
+	 * cache pages start out inactive, every cache fault will tip
+	 * the scan balance towards the file LRU. And as the file LRU
+	 * shrinks, so does the window for rotation from references.
+	 * This means we have a runaway feedback loop where a tiny
+	 * thrashing file LRU becomes infinitely more attractive than
+	 * anon pages. Try to detect this based on file LRU size.
+	 */
+	if (!cgroup_reclaim(sc)) {
+		unsigned long total_high_wmark = 0;
+		unsigned long free, anon;
+		int z;
+
+		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+		       node_page_state(pgdat, NR_INACTIVE_FILE);
+
+		for (z = 0; z < MAX_NR_ZONES; z++) {
+			struct zone *zone = &pgdat->node_zones[z];
+
+			if (!managed_zone(zone))
+				continue;
+
+			total_high_wmark += high_wmark_pages(zone);
+		}
+
+		/*
+		 * Consider anon: if that's low too, this isn't a
+		 * runaway file reclaim problem, but rather just
+		 * extreme pressure. Reclaim as per usual then.
+		 */
+		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
+
+		sc->file_is_tiny =
+			file + free <= total_high_wmark &&
+			!(sc->may_deactivate & DEACTIVATE_ANON) &&
+			anon >> sc->priority;
+	}
+}
+
/*
 * Determine how aggressively the anon and file LRU lists should be
 * scanned. The relative value of each set of LRU lists is determined
@@ -3032,7 +3129,6 @@ static void shrink_node(pg_data_t *pgdat
	unsigned long nr_reclaimed, nr_scanned;
	struct lruvec *target_lruvec;
	bool reclaimable = false;
-	unsigned long file;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

@@ -3048,93 +3144,7 @@ again:
	nr_reclaimed = sc->nr_reclaimed;
	nr_scanned = sc->nr_scanned;

-	/*
-	 * Determine the scan balance between anon and file LRUs.
-	 */
-	spin_lock_irq(&target_lruvec->lru_lock);
-	sc->anon_cost = target_lruvec->anon_cost;
-	sc->file_cost = target_lruvec->file_cost;
-	spin_unlock_irq(&target_lruvec->lru_lock);
-
-	/*
-	 * Target desirable inactive:active list ratios for the anon
-	 * and file LRU lists.
-	 */
-	if (!sc->force_deactivate) {
-		unsigned long refaults;
-
-		refaults = lruvec_page_state(target_lruvec,
-				WORKINGSET_ACTIVATE_ANON);
-		if (refaults != target_lruvec->refaults[0] ||
-			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
-			sc->may_deactivate |= DEACTIVATE_ANON;
-		else
-			sc->may_deactivate &= ~DEACTIVATE_ANON;
-
-		/*
-		 * When refaults are being observed, it means a new
-		 * workingset is being established. Deactivate to get
-		 * rid of any stale active pages quickly.
-		 */
-		refaults = lruvec_page_state(target_lruvec,
-				WORKINGSET_ACTIVATE_FILE);
-		if (refaults != target_lruvec->refaults[1] ||
-			inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
-			sc->may_deactivate |= DEACTIVATE_FILE;
-		else
-			sc->may_deactivate &= ~DEACTIVATE_FILE;
-	} else
-		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
-
-	/*
-	 * If we have plenty of inactive file pages that aren't
-	 * thrashing, try to reclaim those first before touching
-	 * anonymous pages.
-	 */
-	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
-	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
-		sc->cache_trim_mode = 1;
-	else
-		sc->cache_trim_mode = 0;
-
-	/*
-	 * Prevent the reclaimer from falling into the cache trap: as
-	 * cache pages start out inactive, every cache fault will tip
-	 * the scan balance towards the file LRU. And as the file LRU
-	 * shrinks, so does the window for rotation from references.
-	 * This means we have a runaway feedback loop where a tiny
-	 * thrashing file LRU becomes infinitely more attractive than
-	 * anon pages. Try to detect this based on file LRU size.
-	 */
-	if (!cgroup_reclaim(sc)) {
-		unsigned long total_high_wmark = 0;
-		unsigned long free, anon;
-		int z;
-
-		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
-		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
-		       node_page_state(pgdat, NR_INACTIVE_FILE);
-
-		for (z = 0; z < MAX_NR_ZONES; z++) {
-			struct zone *zone = &pgdat->node_zones[z];
-			if (!managed_zone(zone))
-				continue;
-
-			total_high_wmark += high_wmark_pages(zone);
-		}
-
-		/*
-		 * Consider anon: if that's low too, this isn't a
-		 * runaway file reclaim problem, but rather just
-		 * extreme pressure. Reclaim as per usual then.
-		 */
-		anon = node_page_state(pgdat, NR_INACTIVE_ANON);
-
-		sc->file_is_tiny =
-			file + free <= total_high_wmark &&
-			!(sc->may_deactivate & DEACTIVATE_ANON) &&
-			anon >> sc->priority;
-	}
+	prepare_scan_count(pgdat, sc);

	shrink_node_memcgs(pgdat, sc);
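One heuristic above is worth a worked example: prepare_scan_count() decides there are "plenty" of inactive file pages with a shift. The model below is illustrative and standalone, not from the patch; file_is_plenty() is hypothetical.

/* With the default starting priority of 12 and 4 KiB pages, the test is
 * nonzero once the inactive file list exceeds 2^12 = 4096 pages (16 MiB);
 * each lower priority halves that threshold.
 */
static bool file_is_plenty(unsigned long nr_inactive_file, int priority)
{
	return nr_inactive_file >> priority;
}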
@@ -0,0 +1,996 @@
From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:12:33 -0700
Subject: [PATCH 04/10] mm: multigenerational lru: groundwork

For each lruvec, evictable pages are divided into multiple
generations. The youngest generation number is stored in
lrugen->max_seq for both anon and file types as they are aged on an
equal footing. The oldest generation numbers are stored in
lrugen->min_seq[] separately for anon and file types as clean file
pages can be evicted regardless of swap constraints. These three
variables are monotonically increasing. Generation numbers are
truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
page->flags. The sliding window technique is used to prevent truncated
generation numbers from overlapping. Each truncated generation number
is an index to
lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].

The framework comprises two conceptually independent components: the
aging, which produces young generations, and the eviction, which
consumes old generations. Both can be invoked independently from user
space for the purpose of working set estimation and proactive reclaim.

The protection of hot pages and the selection of cold pages are based
on page access types and patterns. There are two access types: one via
page tables and the other via file descriptors. The protection of the
former type is by design stronger because:
1) The uncertainty in determining the access patterns of the former
   type is higher due to the coalesced nature of the accessed bit.
2) The cost of evicting the former type is higher due to the TLB
   flushes required and the likelihood of involving I/O.
3) The penalty of under-protecting the former type is higher because
   applications usually do not prepare themselves for major faults like
   they do for blocked I/O. For example, client applications commonly
   dedicate blocked I/O to separate threads to avoid UI janks that
   negatively affect user experience.

There are also two access patterns: one with temporal locality and the
other without. The latter pattern, e.g., random and sequential, needs
to be explicitly excluded to avoid weakening the protection of the
former pattern. Generally the former type follows the former pattern
unless MADV_SEQUENTIAL is specified and the latter type follows the
latter pattern unless outlying refaults have been observed.

Upon faulting, a page is added to the youngest generation, which
provides the strongest protection as the eviction will not consider
this page before the aging has scanned it at least twice. The first
scan clears the accessed bit set during the initial fault. And the
second scan makes sure this page has not been used since the first
scan. A page from any other generations is brought back to the
youngest generation whenever the aging finds the accessed bit set on
any of the PTEs mapping this page.

Unmapped pages are initially added to the oldest generation and then
conditionally protected by tiers. This is done later [PATCH 07/10].

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
---
 fs/fuse/dev.c | 3 +-
 include/linux/cgroup.h | 15 +-
 include/linux/mm.h | 36 ++++
 include/linux/mm_inline.h | 182 ++++++++++++++++++++
 include/linux/mmzone.h | 70 ++++++++
 include/linux/page-flags-layout.h | 19 ++-
 include/linux/page-flags.h | 4 +-
 include/linux/sched.h | 3 +
 kernel/bounds.c | 3 +
 kernel/cgroup/cgroup-internal.h | 1 -
 mm/huge_memory.c | 3 +-
 mm/memcontrol.c | 1 +
 mm/memory.c | 7 +
 mm/mm_init.c | 6 +-
 mm/page_alloc.c | 1 +
 mm/swap.c | 9 +-
 mm/swapfile.c | 2 +
 mm/vmscan.c | 268 ++++++++++++++++++++++++++++++
 18 files changed, 618 insertions(+), 15 deletions(-)

--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
		1 << PG_active |
		1 << PG_workingset |
		1 << PG_reclaim |
-		1 << PG_waiters))) {
+		1 << PG_waiters |
+		LRU_GEN_MASK | LRU_REFS_MASK))) {
		dump_page(page, "fuse: trying to steal weird page");
		return 1;
	}
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr
	css_put(&cgrp->self);
}

+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+	mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+	mutex_unlock(&cgroup_mutex);
+}
+
/**
 * task_css_set_check - obtain a task's css_set with extra access conditions
 * @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr
 * as locks used during the cgroup_subsys::attach() methods.
 */
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
	rcu_dereference_check((task)->cgroups, \
@@ -707,6 +718,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
					 struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
#define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH)
#define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH)
#define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
+#define LRU_GEN_PGOFF (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
+#define LRU_REFS_PGOFF (LRU_GEN_PGOFF - LRU_REFS_WIDTH)

/*
 * Define the bit shifts to access each section. For non-existent
@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s
		loff_t const holebegin, loff_t const holelen, int even_cows) { }
#endif

+#ifdef CONFIG_LRU_GEN
+static inline void task_enter_nonseq_fault(void)
+{
+	WARN_ON(current->in_nonseq_fault);
+
+	current->in_nonseq_fault = 1;
+}
+
+static inline void task_exit_nonseq_fault(void)
+{
+	WARN_ON(!current->in_nonseq_fault);
+
+	current->in_nonseq_fault = 0;
+}
+
+static inline bool task_in_nonseq_fault(void)
+{
+	return current->in_nonseq_fault;
+}
+#else
+static inline void task_enter_nonseq_fault(void)
+{
+}
+
+static inline void task_exit_nonseq_fault(void)
+{
+}
+
+static inline bool task_in_nonseq_fault(void)
+{
+	return false;
+}
+#endif /* CONFIG_LRU_GEN */
+
static inline void unmap_shared_mapping_range(struct address_space *mapping,
		loff_t const holebegin, loff_t const holelen)
{
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag
	return lru;
}

+#ifdef CONFIG_LRU_GEN
+
+static inline bool lru_gen_enabled(void)
+{
+#ifdef CONFIG_LRU_GEN_ENABLED
+	DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
+
+	return static_branch_likely(&lru_gen_static_key);
+#else
+	DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
+
+	return static_branch_unlikely(&lru_gen_static_key);
+#endif
+}
+
+/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
+static inline int lru_gen_from_seq(unsigned long seq)
+{
+	return seq % MAX_NR_GENS;
+}
+
+/* The youngest and the second youngest generations are counted as active. */
+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
+{
+	unsigned long max_seq = lruvec->evictable.max_seq;
+
+	VM_BUG_ON(gen >= MAX_NR_GENS);
+
+	return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
+}
+
+/* Update the sizes of the multigenerational lru lists. */
+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
+				       int old_gen, int new_gen)
+{
+	int type = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	int delta = thp_nr_pages(page);
+	enum lru_list lru = type * LRU_FILE;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	lockdep_assert_held(&lruvec->lru_lock);
+	VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
+	VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
+	VM_BUG_ON(old_gen == -1 && new_gen == -1);
+
+	if (old_gen >= 0)
+		WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
+			   lrugen->sizes[old_gen][type][zone] - delta);
+	if (new_gen >= 0)
+		WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
+			   lrugen->sizes[new_gen][type][zone] + delta);
+
+	if (old_gen < 0) {
+		if (lru_gen_is_active(lruvec, new_gen))
+			lru += LRU_ACTIVE;
+		update_lru_size(lruvec, lru, zone, delta);
+		return;
+	}
+
+	if (new_gen < 0) {
+		if (lru_gen_is_active(lruvec, old_gen))
+			lru += LRU_ACTIVE;
+		update_lru_size(lruvec, lru, zone, -delta);
+		return;
+	}
+
+	if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
+		update_lru_size(lruvec, lru, zone, -delta);
+		update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
+	}
+
+	VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
+}
+
+/* Add a page to one of the multigenerational lru lists. Return true on success. */
+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+	int gen;
+	unsigned long old_flags, new_flags;
+	int type = page_is_file_lru(page);
+	int zone = page_zonenum(page);
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	if (PageUnevictable(page) || !lrugen->enabled[type])
+		return false;
+	/*
+	 * If a page shouldn't be considered for eviction, i.e., a page mapped
+	 * upon fault during which the accessed bit is set, add it to the
+	 * youngest generation.
+	 *
+	 * If a page can't be evicted immediately, i.e., an anon page not in
+	 * swap cache or a dirty page pending writeback, add it to the second
+	 * oldest generation.
+	 *
+	 * If a page could be evicted immediately, e.g., a clean page, add it to
+	 * the oldest generation.
+	 */
+	if (PageActive(page))
+		gen = lru_gen_from_seq(lrugen->max_seq);
+	else if ((!type && !PageSwapCache(page)) ||
+		 (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
+		gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
+	else
+		gen = lru_gen_from_seq(lrugen->min_seq[type]);
+
+	do {
+		new_flags = old_flags = READ_ONCE(page->flags);
+		VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
+
+		new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
+		new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
+	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+	lru_gen_update_size(page, lruvec, -1, gen);
+	/* for rotate_reclaimable_page() */
+	if (reclaiming)
+		list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
+	else
+		list_add(&page->lru, &lrugen->lists[gen][type][zone]);
+
+	return true;
+}
+
+/* Delete a page from one of the multigenerational lru lists. Return true on success. */
+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+	int gen;
+	unsigned long old_flags, new_flags;
+
+	do {
+		new_flags = old_flags = READ_ONCE(page->flags);
+		if (!(new_flags & LRU_GEN_MASK))
+			return false;
+
+		VM_BUG_ON_PAGE(PageActive(page), page);
+		VM_BUG_ON_PAGE(PageUnevictable(page), page);
+
+		gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+
+		new_flags &= ~LRU_GEN_MASK;
+		/* for shrink_page_list() */
+		if (reclaiming)
+			new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
+		else if (lru_gen_is_active(lruvec, gen))
+			new_flags |= BIT(PG_active);
+	} while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
+
+	lru_gen_update_size(page, lruvec, gen, -1);
+	list_del(&page->lru);
+
+	return true;
+}
+
+#else
+
+static inline bool lru_gen_enabled(void)
+{
+	return false;
+}
+
+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+	return false;
+}
+
+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
+{
+	return false;
+}
+
+#endif /* CONFIG_LRU_GEN */
+
static __always_inline void add_page_to_lru_list(struct page *page,
				struct lruvec *lruvec)
{
	enum lru_list lru = page_lru(page);

+	if (lru_gen_add_page(page, lruvec, false))
+		return;
+
	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
	list_add(&page->lru, &lruvec->lists[lru]);
}
@@ -93,6 +269,9 @@ static __always_inline void add_page_to_
{
	enum lru_list lru = page_lru(page);

+	if (lru_gen_add_page(page, lruvec, true))
+		return;
+
	update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
	list_add_tail(&page->lru, &lruvec->lists[lru]);
}
@@ -100,6 +279,9 @@ static __always_inline void add_page_to_
static __always_inline void del_page_from_lru_list(struct page *page,
				struct lruvec *lruvec)
{
+	if (lru_gen_del_page(page, lruvec, false))
+		return;
+
	list_del(&page->lru);
	update_lru_size(lruvec, page_lru(page), page_zonenum(page),
			-thp_nr_pages(page));
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -294,6 +294,72 @@ enum lruvec_flags {
 */
};

+struct lruvec;
+
+#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
+#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
+
+#ifdef CONFIG_LRU_GEN
+
+/*
+ * For each lruvec, evictable pages are divided into multiple generations. The
+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
+ * monotonically increasing. The sliding window technique is used to track at
+ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
+ * window, AKA gen, indexes an array of per-type and per-zone lists for the
+ * corresponding generation. The counter in page->flags stores gen+1 while a
+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
+ *
+ * After a page is faulted in, the aging must check the accessed bit at least
+ * twice before the eviction would consider it. The first check clears the
+ * accessed bit set during the initial fault. The second check makes sure this
+ * page hasn't been used since then.
+ */
+#define MIN_NR_GENS 2
+#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+
+struct lrugen {
+	/* the aging increments the max generation number */
+	unsigned long max_seq;
+	/* the eviction increments the min generation numbers */
+	unsigned long min_seq[ANON_AND_FILE];
+	/* the birth time of each generation in jiffies */
+	unsigned long timestamps[MAX_NR_GENS];
+	/* the multigenerational lru lists */
+	struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+	/* the sizes of the multigenerational lru lists in pages */
+	unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+	/* whether the multigenerational lru is enabled */
+	bool enabled[ANON_AND_FILE];
+};
+
+#define MAX_BATCH_SIZE 8192
+
+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
+void lru_gen_change_state(bool enable, bool main, bool swap);
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg);
+#endif
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
+{
+}
+
+static inline void lru_gen_change_state(bool enable, bool main, bool swap)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+}
+#endif
+
+#endif /* CONFIG_LRU_GEN */
+
struct lruvec {
	struct list_head lists[NR_LRU_LISTS];
	/* per lruvec lru_lock for memcg */
@@ -311,6 +377,10 @@ struct lruvec {
	unsigned long refaults[ANON_AND_FILE];
	/* Various lruvec state flags (enum lruvec_flags) */
	unsigned long flags;
+#ifdef CONFIG_LRU_GEN
+	/* unevictable pages are on LRU_UNEVICTABLE */
+	struct lrugen evictable;
+#endif
#ifdef CONFIG_MEMCG
	struct pglist_data *pgdat;
#endif
--- a/include/linux/page-flags-layout.h
+++ b/include/linux/page-flags-layout.h
@@ -26,6 +26,14 @@

#define ZONES_WIDTH ZONES_SHIFT

+#ifdef CONFIG_LRU_GEN
+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
+#define LRU_REFS_WIDTH (CONFIG_TIERS_PER_GEN - 2)
+#else
+#define LRU_GEN_WIDTH 0
+#define LRU_REFS_WIDTH 0
+#endif /* CONFIG_LRU_GEN */
+
#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
#define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
@@ -55,7 +63,8 @@
#define SECTIONS_WIDTH 0
#endif

-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
+	<= BITS_PER_LONG - NR_PAGEFLAGS
#define NODES_WIDTH NODES_SHIFT
#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
#error "Vmemmap: No space for nodes field in page flags"
@@ -89,8 +98,8 @@
#define LAST_CPUPID_SHIFT 0
#endif

-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
-	<= BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+	KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
#define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
#else
#define LAST_CPUPID_WIDTH 0
@@ -100,8 +109,8 @@
#define LAST_CPUPID_NOT_IN_PAGE_FLAGS
#endif

-#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
-	> BITS_PER_LONG - NR_PAGEFLAGS
+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
+	KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
#error "Not enough bits in page flags"
#endif

--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
	 1UL << PG_private | 1UL << PG_private_2 | \
	 1UL << PG_writeback | 1UL << PG_reserved | \
	 1UL << PG_slab | 1UL << PG_active | \
-	 1UL << PG_unevictable | __PG_MLOCKED)
+	 1UL << PG_unevictable | __PG_MLOCKED | LRU_GEN_MASK)

/*
 * Flags checked when a page is prepped for return by the page allocator.
@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
 * alloc-free cycle to prevent from reusing the page.
 */
#define PAGE_FLAGS_CHECK_AT_PREP \
-	(PAGEFLAGS_MASK & ~__PG_HWPOISON)
+	((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)

#define PAGE_FLAGS_PRIVATE \
	(1UL << PG_private | 1UL << PG_private_2)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -911,6 +911,9 @@ struct task_struct {
#ifdef CONFIG_MEMCG
	unsigned in_user_fault:1;
#endif
+#ifdef CONFIG_LRU_GEN
+	unsigned in_nonseq_fault:1;
+#endif
#ifdef CONFIG_COMPAT_BRK
	unsigned brk_randomized:1;
#endif
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -22,6 +22,9 @@ int main(void)
	DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
	DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
+#ifdef CONFIG_LRU_GEN
+	DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
+#endif
	/* End of constants */

	return 0;
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)

-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc
#ifdef CONFIG_64BIT
			 (1L << PG_arch_2) |
#endif
-			 (1L << PG_dirty)));
+			 (1L << PG_dirty) |
+			 LRU_GEN_MASK | LRU_REFS_MASK));

	/* ->mapping in first tail page is compound_mapcount */
	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all
	memcg->deferred_split_queue.split_queue_len = 0;
#endif
	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
+	lru_gen_init_memcg(memcg);
	return memcg;
fail:
	mem_cgroup_id_remove(memcg);
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are
			   unsigned int flags, struct pt_regs *regs)
{
	vm_fault_t ret;
+	bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);

	__set_current_state(TASK_RUNNING);

@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are
	if (flags & FAULT_FLAG_USER)
		mem_cgroup_enter_user_fault();

+	if (nonseq_fault)
+		task_enter_nonseq_fault();
+
	if (unlikely(is_vm_hugetlb_page(vma)))
		ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
	else
		ret = __handle_mm_fault(vma, address, flags);

+	if (nonseq_fault)
+		task_exit_nonseq_fault();
+
	if (flags & FAULT_FLAG_USER) {
		mem_cgroup_exit_user_fault();
		/*
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo

	shift = 8 * sizeof(unsigned long);
	width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
-		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
+		- LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
-		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
+		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
		SECTIONS_WIDTH,
		NODES_WIDTH,
		ZONES_WIDTH,
		LAST_CPUPID_WIDTH,
		KASAN_TAG_WIDTH,
+		LRU_GEN_WIDTH,
+		LRU_REFS_WIDTH,
		NR_PAGEFLAGS);
	mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
		"Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7411,6 +7411,7 @@ static void __meminit pgdat_init_interna

	pgdat_page_ext_init(pgdat);
	lruvec_init(&pgdat->__lruvec);
+	lru_gen_init_state(NULL, &pgdat->__lruvec);
}

static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);

+	/* see the comment in lru_gen_add_page() */
+	if (lru_gen_enabled() && !PageUnevictable(page) &&
+	    task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
+		SetPageActive(page);
+
	get_page(page);
	local_lock(&lru_pvecs.lock);
	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc

static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
-	if (PageActive(page) && !PageUnevictable(page)) {
+	if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
		int nr_pages = thp_nr_pages(page);

		del_page_from_lru_list(page, lruvec);
@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p
 */
void deactivate_page(struct page *page)
{
-	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+	if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
		struct pagevec *pvec;

		local_lock(&lru_pvecs.lock);
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
	err = 0;
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);
+	lru_gen_change_state(false, false, true);

out_dput:
	filp_close(victim, NULL);
@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use
	mutex_unlock(&swapon_mutex);
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);
+	lru_gen_change_state(true, false, true);

	error = 0;
	goto out;
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
+#include <linux/memory.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg
	return can_demote(pgdat->node_id, sc);
}

+#ifdef CONFIG_LRU_GEN
+
+/******************************************************************************
+ * shorthand helpers
+ ******************************************************************************/
+
+#define for_each_gen_type_zone(gen, type, zone) \
+	for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
+		for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
+			for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
+
+static int page_lru_gen(struct page *page)
+{
+	unsigned long flags = READ_ONCE(page->flags);
+
+	return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
+}
+
+static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
+{
+	struct pglist_data *pgdat = NODE_DATA(nid);
+
+#ifdef CONFIG_MEMCG
+	if (memcg) {
+		struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
+
+		if (lruvec->pgdat != pgdat)
+			lruvec->pgdat = pgdat;
+
+		return lruvec;
+	}
+#endif
+	return pgdat ? &pgdat->__lruvec : NULL;
+}
+
+static int get_nr_gens(struct lruvec *lruvec, int type)
+{
+	return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
+}
+
+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
+{
+	return get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
+	       get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) &&
+	       get_nr_gens(lruvec, 0) <= MAX_NR_GENS;
+}
+
+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
+#else
+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
+#endif
+
+static int lru_gen_nr_swapfiles;
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+	int gen, type, zone;
+	enum lru_list lru;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	for_each_evictable_lru(lru) {
+		type = is_file_lru(lru);
+
+		if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
+			return false;
+	}
+
+	for_each_gen_type_zone(gen, type, zone) {
+		if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
+			return false;
+
+		/* unlikely but not a bug when reset_batch_size() is pending */
+		VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
+	}
+
+	return true;
+}
+
+static bool fill_lists(struct lruvec *lruvec)
+{
+	enum lru_list lru;
+	int remaining = MAX_BATCH_SIZE;
+
+	for_each_evictable_lru(lru) {
+		int type = is_file_lru(lru);
+		bool active = is_active_lru(lru);
+		struct list_head *head = &lruvec->lists[lru];
+
+		if (!lruvec->evictable.enabled[type])
+			continue;
+
+		while (!list_empty(head)) {
+			bool success;
+			struct page *page = lru_to_page(head);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page) != active, page);
+			VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+			VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			del_page_from_lru_list(page, lruvec);
+			success = lru_gen_add_page(page, lruvec, false);
+			VM_BUG_ON(!success);
+
+			if (!--remaining)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static bool drain_lists(struct lruvec *lruvec)
+{
+	int gen, type, zone;
+	int remaining = MAX_BATCH_SIZE;
+
+	for_each_gen_type_zone(gen, type, zone) {
+		struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
+
+		if (lruvec->evictable.enabled[type])
+			continue;
+
+		while (!list_empty(head)) {
+			bool success;
+			struct page *page = lru_to_page(head);
+
+			VM_BUG_ON_PAGE(PageTail(page), page);
+			VM_BUG_ON_PAGE(PageUnevictable(page), page);
+			VM_BUG_ON_PAGE(PageActive(page), page);
+			VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
+			VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
+
+			prefetchw_prev_lru_page(page, head, flags);
+
+			success = lru_gen_del_page(page, lruvec, false);
+			VM_BUG_ON(!success);
+			add_page_to_lru_list(page, lruvec);
+
+			if (!--remaining)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * For file page tracking, we enable/disable it according to the main switch.
+ * For anon page tracking, we only enabled it when the main switch is on and
+ * there is at least one swapfile; we disable it when there are no swapfiles
+ * regardless of the value of the main switch. Otherwise, we will eventually
+ * reach the max size of the sliding window and have to call inc_min_seq().
+ */
+void lru_gen_change_state(bool enable, bool main, bool swap)
+{
+	static DEFINE_MUTEX(state_mutex);
+
+	struct mem_cgroup *memcg;
+
+	mem_hotplug_begin();
+	cgroup_lock();
+	mutex_lock(&state_mutex);
+
+	if (swap) {
+		if (enable)
+			swap = !lru_gen_nr_swapfiles++;
+		else
+			swap = !--lru_gen_nr_swapfiles;
+	}
+
+	if (main && enable != lru_gen_enabled()) {
+		if (enable)
+			static_branch_enable(&lru_gen_static_key);
+		else
+			static_branch_disable(&lru_gen_static_key);
+	} else if (!swap || !lru_gen_enabled())
+		goto unlock;
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		int nid;
+
+		for_each_node(nid) {
+			struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+			if (!lruvec)
+				continue;
+
+			spin_lock_irq(&lruvec->lru_lock);
+
+			VM_BUG_ON(!seq_is_valid(lruvec));
+			VM_BUG_ON(!state_is_valid(lruvec));
+
+			lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+			lruvec->evictable.enabled[1] = lru_gen_enabled();
+
+			while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
+				spin_unlock_irq(&lruvec->lru_lock);
+				cond_resched();
+				spin_lock_irq(&lruvec->lru_lock);
+			}
+
+			spin_unlock_irq(&lruvec->lru_lock);
+		}
+
+		cond_resched();
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+	mutex_unlock(&state_mutex);
+	cgroup_unlock();
+	mem_hotplug_done();
+}
+
+/******************************************************************************
+ * initialization
+ ******************************************************************************/
+
+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
+{
+	int i;
+	int gen, type, zone;
+	struct lrugen *lrugen = &lruvec->evictable;
+
+	lrugen->max_seq = MIN_NR_GENS + 1;
+	lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
+	lrugen->enabled[1] = lru_gen_enabled();
+
+	for (i = 0; i <= MIN_NR_GENS + 1; i++)
+		lrugen->timestamps[i] = jiffies;
+
+	for_each_gen_type_zone(gen, type, zone)
+		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_init_memcg(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	for_each_node(nid) {
+		struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+		lru_gen_init_state(memcg, lruvec);
+	}
+}
+#endif
+
+static int __init init_lru_gen(void)
+{
+	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
+	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+
+	return 0;
+};
+late_initcall(init_lru_gen);
+
+#endif /* CONFIG_LRU_GEN */
+
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
	unsigned long nr[NR_LRU_LISTS];
@@ -0,0 +1,760 @@
From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 5 Apr 2021 04:17:41 -0600
Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list

To scan PTEs for accessed pages, a mm_struct list is maintained for
each memcg. When multiple threads traverse the same memcg->mm_list,
each of them gets a unique mm_struct and therefore they can run
walk_page_range() concurrently to reach page tables of all processes
of this memcg.

This infrastructure also provides the following optimizations:
1) it allows walkers to skip processes that have been sleeping since
the last walk by tracking the usage of mm_struct between context
switches.
2) it allows walkers to add interesting items they find during a
walk to a Bloom filter so that they can skip uninteresting items
during the next walk by testing whether an item is in this Bloom
filter.
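
A Bloom filter gives O(1) membership tests with no false negatives: a
"no" is definitive, while a "yes" only means the item must be revisited.
To make the scheme concrete, here is a minimal userspace sketch of the
double-buffered, two-probe filter this patch implements later in
mm/vmscan.c (hash_item() is a hypothetical stand-in for the kernel's
hash_ptr(); the 2^15-bit sizing follows BLOOM_FILTER_SHIFT below):

#include <stdbool.h>
#include <stdint.h>

#define FILTER_SHIFT 15 /* 2^15 bits per filter, as in this patch */
#define FILTER_BITS (1u << FILTER_SHIFT)

/* two filters, flipped after each round of walk (double buffering) */
static uint64_t filters[2][FILTER_BITS / 64];

/* hypothetical stand-in for the kernel's hash_ptr() */
static uint32_t hash_item(const void *item)
{
	uint64_t p = (uint64_t)(uintptr_t)item >> 4; /* drop alignment zeros */

	return (uint32_t)(p * 2654435761u); /* Knuth multiplicative hash */
}

/* one hash yields two independent 15-bit probe positions */
static void item_keys(const void *item, uint32_t key[2])
{
	uint32_t hash = hash_item(item);

	key[0] = hash & (FILTER_BITS - 1);
	key[1] = (hash >> FILTER_SHIFT) & (FILTER_BITS - 1);
}

static void filter_set(int gen, const void *item)
{
	uint32_t key[2];

	item_keys(item, key);
	filters[gen][key[0] / 64] |= 1ULL << (key[0] % 64);
	filters[gen][key[1] / 64] |= 1ULL << (key[1] % 64);
}

/* false: definitely never added; true: possibly added, walk it again */
static bool filter_test(int gen, const void *item)
{
	uint32_t key[2];

	item_keys(item, key);
	return (filters[gen][key[0] / 64] & (1ULL << (key[0] % 64))) &&
	       (filters[gen][key[1] / 64] & (1ULL << (key[1] % 64)));
}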

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8
---
fs/exec.c | 2 +
include/linux/memcontrol.h | 4 +
include/linux/mm_inline.h | 6 +
include/linux/mm_types.h | 75 +++++++++
include/linux/mmzone.h | 63 +++++++
kernel/exit.c | 1 +
kernel/fork.c | 9 +
kernel/sched/core.c | 1 +
mm/memcontrol.c | 25 +++
mm/vmscan.c | 331 +++++++++++++++++++++++++++++++++++++
10 files changed, 517 insertions(+)

--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
active_mm = tsk->active_mm;
tsk->active_mm = mm;
tsk->mm = mm;
+ lru_gen_add_mm(mm);
/*
* This prevents preemption while active_mm is being loaded and
* it and mm are being updated, which could cause problems for
@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m
if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
activate_mm(active_mm, mm);
+ lru_gen_activate_mm(mm);
if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
local_irq_enable();
tsk->mm->vmacache_seqnum = 0;
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -348,6 +348,10 @@ struct mem_cgroup {
struct deferred_split deferred_split_queue;
#endif

+#ifdef CONFIG_LRU_GEN
+ struct lru_gen_mm_list mm_list;
+#endif
+
struct mem_cgroup_per_node *nodeinfo[];
};

--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig
return seq % MAX_NR_GENS;
}

+/* Return a proper index regardless of whether we keep stats for historical generations. */
+static inline int lru_hist_from_seq(unsigned long seq)
+{
+ return seq % NR_HIST_GENS;
+}
+
/* The youngest and the second youngest generations are counted as active. */
static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
{
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -3,6 +3,7 @@
#define _LINUX_MM_TYPES_H

#include <linux/mm_types_task.h>
+#include <linux/sched.h>

#include <linux/auxvec.h>
#include <linux/list.h>
@@ -15,6 +16,8 @@
#include <linux/page-flags-layout.h>
#include <linux/workqueue.h>
#include <linux/seqlock.h>
+#include <linux/nodemask.h>
+#include <linux/mmdebug.h>

#include <asm/mmu.h>

@@ -580,6 +583,18 @@ struct mm_struct {
#ifdef CONFIG_IOMMU_SUPPORT
u32 pasid;
#endif
+#ifdef CONFIG_LRU_GEN
+ struct {
+ /* the node of a global or per-memcg mm_struct list */
+ struct list_head list;
+#ifdef CONFIG_MEMCG
+ /* points to the memcg of the owner task above */
+ struct mem_cgroup *memcg;
+#endif
+ /* whether this mm_struct has been used since the last walk */
+ nodemask_t nodes;
+ } lrugen;
+#endif /* CONFIG_LRU_GEN */
} __randomize_layout;

/*
@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru
return (struct cpumask *)&mm->cpu_bitmap;
}

+#ifdef CONFIG_LRU_GEN
+
+struct lru_gen_mm_list {
+ /* a global or per-memcg mm_struct list */
+ struct list_head fifo;
+ /* protects the list above */
+ spinlock_t lock;
+};
+
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+ INIT_LIST_HEAD(&mm->lrugen.list);
+#ifdef CONFIG_MEMCG
+ mm->lrugen.memcg = NULL;
+#endif
+ nodes_clear(mm->lrugen.nodes);
+}
+
+/* Track the usage of each mm_struct so that we can skip inactive ones. */
+static inline void lru_gen_activate_mm(struct mm_struct *mm)
+{
+ /* unlikely but not a bug when racing with lru_gen_migrate_mm() */
+ VM_WARN_ON(list_empty(&mm->lrugen.list));
+
+ if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes))
+ nodes_setall(mm->lrugen.nodes);
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_add_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_del_mm(struct mm_struct *mm)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_activate_mm(struct mm_struct *mm)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
struct mmu_gather;
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -318,6 +318,13 @@ struct lruvec;
#define MIN_NR_GENS 2
#define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS)

+/* Whether to keep stats for historical generations. */
+#ifdef CONFIG_LRU_GEN_STATS
+#define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS)
+#else
+#define NR_HIST_GENS 1U
+#endif
+
struct lrugen {
/* the aging increments the max generation number */
unsigned long max_seq;
@@ -333,13 +340,63 @@ struct lrugen {
bool enabled[ANON_AND_FILE];
};

+enum {
+ MM_LEAF_TOTAL, /* total leaf entries */
+ MM_LEAF_OLD, /* old leaf entries */
+ MM_LEAF_YOUNG, /* young leaf entries */
+ MM_NONLEAF_TOTAL, /* total non-leaf entries */
+ MM_NONLEAF_PREV, /* previously worthy non-leaf entries */
+ MM_NONLEAF_CUR, /* currently worthy non-leaf entries */
+ NR_MM_STATS
+};
+
+/* mnemonic codes for the stats above */
+#define MM_STAT_CODES "toydpc"
+
+/* double buffering bloom filters */
+#define NR_BLOOM_FILTERS 2
+
+struct lru_gen_mm_walk {
+ /* set to max_seq after each round of walk */
+ unsigned long seq;
+ /* the next mm_struct on the list to walk */
+ struct list_head *head;
+ /* the first mm_struct never walked before */
+ struct list_head *tail;
+ /* to wait for the last walker to finish */
+ struct wait_queue_head wait;
+ /* bloom filters flip after each round of walk */
+ unsigned long *filters[NR_BLOOM_FILTERS];
+ /* page table stats for debugging */
+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
+ /* the number of concurrent walkers */
+ int nr_walkers;
+};
+
+#define MIN_BATCH_SIZE 64
#define MAX_BATCH_SIZE 8192

+struct mm_walk_args {
+ struct mem_cgroup *memcg;
+ unsigned long max_seq;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+ unsigned long next_addr;
+ unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)];
+ int node_id;
+ int swappiness;
+ int batch_size;
+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+ int mm_stats[NR_MM_STATS];
+ bool use_filter;
+};
+
void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
void lru_gen_change_state(bool enable, bool main, bool swap);

#ifdef CONFIG_MEMCG
void lru_gen_init_memcg(struct mem_cgroup *memcg);
+void lru_gen_free_memcg(struct mem_cgroup *memcg);
#endif

#else /* !CONFIG_LRU_GEN */
@@ -356,6 +413,10 @@ static inline void lru_gen_change_state(
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
{
}
+
+static inline void lru_gen_free_memcg(struct mem_cgroup *memcg)
+{
+}
#endif

#endif /* CONFIG_LRU_GEN */
@@ -380,6 +441,8 @@ struct lruvec {
#ifdef CONFIG_LRU_GEN
/* unevictable pages are on LRU_UNEVICTABLE */
struct lrugen evictable;
+ /* state for mm list and page table walks */
+ struct lru_gen_mm_walk mm_walk;
#endif
#ifdef CONFIG_MEMCG
struct pglist_data *pgdat;
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -422,6 +422,7 @@ assign_new_owner:
goto retry;
}
WRITE_ONCE(mm->owner, c);
+ lru_gen_migrate_mm(mm);
task_unlock(c);
put_task_struct(c);
}
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1080,6 +1080,7 @@ static struct mm_struct *mm_init(struct
goto fail_nocontext;

mm->user_ns = get_user_ns(user_ns);
+ lru_gen_init_mm(mm);
return mm;

fail_nocontext:
@@ -1122,6 +1123,7 @@ static inline void __mmput(struct mm_str
}
if (mm->binfmt)
module_put(mm->binfmt->module);
+ lru_gen_del_mm(mm);
mmdrop(mm);
}

@@ -2616,6 +2618,13 @@ pid_t kernel_clone(struct kernel_clone_a
get_task_struct(p);
}

+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+ /* lock the task to synchronize with memcg migration */
+ task_lock(p);
+ lru_gen_add_mm(p->mm);
+ task_unlock(p);
+ }
+
wake_up_new_task(p);

/* forking complete and child started to run, tell ptracer */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4978,6 +4978,7 @@ context_switch(struct rq *rq, struct tas
* finish_task_switch()'s mmdrop().
*/
switch_mm_irqs_off(prev->active_mm, next->mm, next);
+ lru_gen_activate_mm(next->mm);

if (!prev->mm) { // from kernel
/* will mmdrop() in finish_task_switch(). */
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5163,6 +5163,7 @@ static void __mem_cgroup_free(struct mem

static void mem_cgroup_free(struct mem_cgroup *memcg)
{
+ lru_gen_free_memcg(memcg);
memcg_wb_domain_exit(memcg);
__mem_cgroup_free(memcg);
}
@@ -6195,6 +6196,29 @@ static void mem_cgroup_move_task(void)
}
#endif

+#ifdef CONFIG_LRU_GEN
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+ struct task_struct *task = NULL;
+
+ cgroup_taskset_for_each_leader(task, css, tset)
+ break;
+
+ if (!task)
+ return;
+
+ task_lock(task);
+ if (task->mm && task->mm->owner == task)
+ lru_gen_migrate_mm(task->mm);
+ task_unlock(task);
+}
+#else
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
if (value == PAGE_COUNTER_MAX)
@@ -6538,6 +6562,7 @@ struct cgroup_subsys memory_cgrp_subsys
.css_reset = mem_cgroup_css_reset,
.css_rstat_flush = mem_cgroup_css_rstat_flush,
.can_attach = mem_cgroup_can_attach,
+ .attach = mem_cgroup_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
.dfl_cftypes = memory_files,
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2929,6 +2929,306 @@ static bool __maybe_unused seq_is_valid(
}

/******************************************************************************
+ * mm_struct list
+ ******************************************************************************/
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+ static struct lru_gen_mm_list mm_list = {
+ .fifo = LIST_HEAD_INIT(mm_list.fifo),
+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
+ };
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ return &memcg->mm_list;
+#endif
+ return &mm_list;
+}
+
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+ int nid;
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+ VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
+#ifdef CONFIG_MEMCG
+ VM_BUG_ON_MM(mm->lrugen.memcg, mm);
+ mm->lrugen.memcg = memcg;
+#endif
+ spin_lock(&mm_list->lock);
+
+ list_add_tail(&mm->lrugen.list, &mm_list->fifo);
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+ if (!lruvec)
+ continue;
+
+ if (lruvec->mm_walk.tail == &mm_list->fifo)
+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev;
+ }
+
+ spin_unlock(&mm_list->lock);
+}
+
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+ int nid;
+ struct lru_gen_mm_list *mm_list;
+ struct mem_cgroup *memcg = NULL;
+
+ if (list_empty(&mm->lrugen.list))
+ return;
+
+#ifdef CONFIG_MEMCG
+ memcg = mm->lrugen.memcg;
+#endif
+ mm_list = get_mm_list(memcg);
+
+ spin_lock(&mm_list->lock);
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+ if (!lruvec)
+ continue;
+
+ if (lruvec->mm_walk.tail == &mm->lrugen.list)
+ lruvec->mm_walk.tail = lruvec->mm_walk.tail->next;
+
+ if (lruvec->mm_walk.head != &mm->lrugen.list)
+ continue;
+
+ lruvec->mm_walk.head = lruvec->mm_walk.head->next;
+ if (lruvec->mm_walk.head == &mm_list->fifo)
+ WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1);
+ }
+
+ list_del_init(&mm->lrugen.list);
+
+ spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+ mem_cgroup_put(mm->lrugen.memcg);
+ mm->lrugen.memcg = NULL;
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+ struct mem_cgroup *memcg;
+
+ lockdep_assert_held(&mm->owner->alloc_lock);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_task(mm->owner);
+ rcu_read_unlock();
+ if (memcg == mm->lrugen.memcg)
+ return;
+
+ VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
+ VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
+
+ lru_gen_del_mm(mm);
+ lru_gen_add_mm(mm);
+}
+#endif
+
+#define BLOOM_FILTER_SHIFT 15
+
+static inline int filter_gen_from_seq(unsigned long seq)
+{
+ return seq % NR_BLOOM_FILTERS;
+}
+
+static void get_item_key(void *item, int *key)
+{
+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
+
+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
+
+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
+ key[1] = hash >> BLOOM_FILTER_SHIFT;
+}
+
+static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+{
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+ filter = lruvec->mm_walk.filters[gen];
+ if (filter) {
+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
+ return;
+ }
+
+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC);
+ WRITE_ONCE(lruvec->mm_walk.filters[gen], filter);
+}
+
+static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
+ if (!filter)
+ return;
+
+ get_item_key(item, key);
+
+ if (!test_bit(key[0], filter))
+ set_bit(key[0], filter);
+ if (!test_bit(key[1], filter))
+ set_bit(key[1], filter);
+}
+
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+ int key[2];
+ unsigned long *filter;
+ int gen = filter_gen_from_seq(seq);
+
+ filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
+ if (!filter)
+ return false;
+
+ get_item_key(item, key);
+
+ return test_bit(key[0], filter) && test_bit(key[1], filter);
+}
+
+static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args)
+{
+ int i;
+ int hist = lru_hist_from_seq(args->max_seq);
+
+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+ for (i = 0; i < NR_MM_STATS; i++) {
+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i],
+ lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]);
+ args->mm_stats[i] = 0;
+ }
+
+ if (!last || NR_HIST_GENS == 1)
+ return;
+
+ hist = lru_hist_from_seq(args->max_seq + 1);
+ for (i = 0; i < NR_MM_STATS; i++)
+ WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0);
+}
+
+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
+{
+ int type;
+ unsigned long size = 0;
+
+ if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes))
+ return true;
+
+ if (mm_is_oom_victim(mm))
+ return true;
+
+ for (type = !args->swappiness; type < ANON_AND_FILE; type++) {
+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
+ get_mm_counter(mm, MM_ANONPAGES) +
+ get_mm_counter(mm, MM_SHMEMPAGES);
+ }
+
+ if (size < MIN_BATCH_SIZE)
+ return true;
+
+ if (!mmget_not_zero(mm))
+ return true;
+
+ node_clear(args->node_id, mm->lrugen.nodes);
+
+ return false;
+}
+
+/* To support multiple walkers that concurrently walk an mm_struct list. */
+static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args,
+ struct mm_struct **iter)
+{
+ bool first = false;
+ bool last = true;
+ struct mm_struct *mm = NULL;
+ struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk;
+ struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
+
+ if (*iter)
+ mmput_async(*iter);
+ else if (args->max_seq <= READ_ONCE(mm_walk->seq))
+ return false;
+
+ spin_lock(&mm_list->lock);
+
+ VM_BUG_ON(args->max_seq > mm_walk->seq + 1);
+ VM_BUG_ON(*iter && args->max_seq < mm_walk->seq);
+ VM_BUG_ON(*iter && !mm_walk->nr_walkers);
+
+ if (args->max_seq <= mm_walk->seq) {
+ if (!*iter)
+ last = false;
+ goto done;
+ }
+
+ if (mm_walk->head == &mm_list->fifo) {
+ VM_BUG_ON(mm_walk->nr_walkers);
+ mm_walk->head = mm_walk->head->next;
+ first = true;
+ }
+
+ while (!mm && mm_walk->head != &mm_list->fifo) {
+ mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list);
+
+ mm_walk->head = mm_walk->head->next;
+
+ if (mm_walk->tail == &mm->lrugen.list) {
+ mm_walk->tail = mm_walk->tail->next;
+ args->use_filter = false;
+ }
+
+ if (should_skip_mm(mm, args))
+ mm = NULL;
+ }
+
+ if (mm_walk->head == &mm_list->fifo)
+ WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1);
+done:
+ if (*iter && !mm)
+ mm_walk->nr_walkers--;
+ if (!*iter && mm)
+ mm_walk->nr_walkers++;
+
+ if (mm_walk->nr_walkers)
+ last = false;
+
+ if (mm && first)
+ clear_bloom_filter(lruvec, args->max_seq + 1);
+
+ if (*iter || last)
+ reset_mm_stats(lruvec, last, args);
+
+ spin_unlock(&mm_list->lock);
+
+ *iter = mm;
+
+ return last;
+}
+
+/******************************************************************************
* state change
******************************************************************************/

@@ -3112,6 +3412,7 @@ void lru_gen_init_state(struct mem_cgrou
int i;
int gen, type, zone;
struct lrugen *lrugen = &lruvec->evictable;
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);

lrugen->max_seq = MIN_NR_GENS + 1;
lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
@@ -3122,6 +3423,17 @@ void lru_gen_init_state(struct mem_cgrou

for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
+ spin_lock(&mm_list->lock);
+
+ lruvec->mm_walk.seq = MIN_NR_GENS;
+ lruvec->mm_walk.head = &mm_list->fifo;
+ lruvec->mm_walk.tail = &mm_list->fifo;
+ init_waitqueue_head(&lruvec->mm_walk.wait);
+
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
+ spin_unlock(&mm_list->lock);
}

#ifdef CONFIG_MEMCG
@@ -3129,18 +3441,37 @@ void lru_gen_init_memcg(struct mem_cgrou
{
int nid;

+ INIT_LIST_HEAD(&memcg->mm_list.fifo);
+ spin_lock_init(&memcg->mm_list.lock);
+
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(nid, memcg);

lru_gen_init_state(memcg, lruvec);
}
}
+
+void lru_gen_free_memcg(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ for_each_node(nid) {
+ int i;
+ struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+ for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+ bitmap_free(lruvec->mm_walk.filters[i]);
+ lruvec->mm_walk.filters[i] = NULL;
+ }
+ }
+}
#endif

static int __init init_lru_gen(void)
{
BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+ BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);

return 0;
};
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,496 @@
From 5cc7fdec54e87e32b4fb0f07d84b21769d5f8d92 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:38:02 -0700
Subject: [PATCH 08/10] mm: multigenerational lru: user interface

Add /sys/kernel/mm/lru_gen/enabled to enable and disable the
multigenerational lru at runtime.

Add /sys/kernel/mm/lru_gen/min_ttl_ms to protect the working set of a
given number of milliseconds. The OOM killer is invoked if this
working set cannot be kept in memory.

Add /sys/kernel/debug/lru_gen to monitor the multigenerational lru and
invoke the aging and the eviction. This file has the following output:
memcg memcg_id memcg_path
node node_id
min_gen birth_time anon_size file_size
...
max_gen birth_time anon_size file_size

min_gen is the oldest generation number and max_gen is the youngest
generation number. birth_time is in milliseconds. anon_size and
file_size are in pages.

This file takes the following input:
+ memcg_id node_id max_gen [swappiness] [use_bloom_filter]
- memcg_id node_id min_gen [swappiness] [nr_to_reclaim]

The first command line invokes the aging, which scans PTEs for
accessed pages and then creates the next generation max_gen+1. A swap
file and a non-zero swappiness, which overrides vm.swappiness, are
required to scan PTEs mapping anon pages. The second command line
invokes the eviction, which evicts generations less than or equal to
min_gen. min_gen should be less than max_gen-1 as max_gen and
max_gen-1 are not fully aged and therefore cannot be evicted.
Setting nr_to_reclaim to N limits the number of pages to evict.
Setting use_bloom_filter to 0 overrides the default behavior which
only scans PTE tables found populated. Multiple command lines are
supported, as is concatenation with delimiters "," and ";".
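
For instance, invoking the aging and then the eviction from a C program
could look like the sketch below (the memcg and node IDs, sequence
numbers, swappiness and page count are illustrative only):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* "+": age memcg 1 on node 0 up to generation 8;
	 * "-": then evict generation 5 and older, swappiness 200,
	 * at most 1000 pages; ";" concatenates the two commands */
	const char *cmds = "+ 1 0 8; - 1 0 5 200 1000\n";
	int fd = open("/sys/kernel/debug/lru_gen", O_WRONLY);

	if (fd < 0 || write(fd, cmds, strlen(cmds)) < 0) {
		perror("lru_gen");
		return 1;
	}
	return close(fd);
}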

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I4448e60029badbe347aa3b624f429b280cc3a3d3
---
include/linux/nodemask.h | 1 +
mm/vmscan.c | 415 +++++++++++++++++++++++++++++++++++++++
2 files changed, 416 insertions(+)

--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -485,6 +485,7 @@ static inline int num_node_state(enum no
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
+#define next_memory_node(nid) (MAX_NUMNODES)
#define nr_node_ids 1U
#define nr_online_nodes 1U

--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,6 +53,8 @@
#include <linux/memory.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -4882,6 +4884,413 @@ unlock:
}

/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
+}
+
+static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ unsigned int msecs;
+
+ if (kstrtouint(buf, 10, &msecs))
+ return -EINVAL;
+
+ WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
+ min_ttl_ms, 0644, show_min_ttl, store_min_ttl
+);
+
+static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", lru_gen_enabled());
+}
+
+static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ bool enable;
+
+ if (kstrtobool(buf, &enable))
+ return -EINVAL;
+
+ lru_gen_change_state(enable, true, false);
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_enable, store_enable
+);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_min_ttl_attr.attr,
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
+/******************************************************************************
+ * debugfs interface
+ ******************************************************************************/
+
+static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
+{
+ struct mem_cgroup *memcg;
+ loff_t nr_to_skip = *pos;
+
+ m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
+ if (!m->private)
+ return ERR_PTR(-ENOMEM);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY) {
+ if (!nr_to_skip--)
+ return get_lruvec(nid, memcg);
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+ return NULL;
+}
+
+static void lru_gen_seq_stop(struct seq_file *m, void *v)
+{
+ if (!IS_ERR_OR_NULL(v))
+ mem_cgroup_iter_break(NULL, lruvec_memcg(v));
+
+ kvfree(m->private);
+ m->private = NULL;
+}
+
+static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int nid = lruvec_pgdat(v)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(v);
+
+ ++*pos;
+
+ nid = next_memory_node(nid);
+ if (nid == MAX_NUMNODES) {
+ memcg = mem_cgroup_iter(NULL, memcg, NULL);
+ if (!memcg)
+ return NULL;
+
+ nid = first_memory_node;
+ }
+
+ return get_lruvec(nid, memcg);
+}
+
+static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
+ unsigned long max_seq, unsigned long *min_seq,
+ unsigned long seq)
+{
+ int i;
+ int type, tier;
+ int hist = lru_hist_from_seq(seq);
+ struct lrugen *lrugen = &lruvec->evictable;
+
+ for (tier = 0; tier < MAX_NR_TIERS; tier++) {
+ seq_printf(m, " %10d", tier);
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ unsigned long n[3] = {};
+
+ if (seq == max_seq) {
+ n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
+ n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
+
+ seq_printf(m, " %10luR %10luT %10lu ", n[0], n[1], n[2]);
+ } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
+ n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
+ n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
+ if (tier)
+ n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
+
+ seq_printf(m, " %10lur %10lue %10lup", n[0], n[1], n[2]);
+ } else
+ seq_puts(m, " 0 0 0 ");
+ }
+ seq_putc(m, '\n');
+ }
+
+ seq_puts(m, " ");
+ for (i = 0; i < NR_MM_STATS; i++) {
+ if (seq == max_seq && NR_HIST_GENS == 1)
+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
+ toupper(MM_STAT_CODES[i]));
+ else if (seq != max_seq && NR_HIST_GENS > 1)
+ seq_printf(m, " %10lu%c", READ_ONCE(lruvec->mm_walk.stats[hist][i]),
+ MM_STAT_CODES[i]);
+ else
+ seq_puts(m, " 0 ");
+ }
+ seq_putc(m, '\n');
+}
+
+static int lru_gen_seq_show(struct seq_file *m, void *v)
+{
+ unsigned long seq;
+ bool full = !debugfs_real_fops(m->file)->write;
+ struct lruvec *lruvec = v;
+ struct lrugen *lrugen = &lruvec->evictable;
+ int nid = lruvec_pgdat(lruvec)->node_id;
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ DEFINE_MAX_SEQ(lruvec);
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (nid == first_memory_node) {
+ const char *path = memcg ? m->private : "";
+
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
+#endif
+ seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
+ }
+
+ seq_printf(m, " node %5d\n", nid);
+
+ if (!full)
+ seq = min_seq[0];
+ else if (max_seq >= MAX_NR_GENS)
+ seq = max_seq - MAX_NR_GENS + 1;
+ else
+ seq = 0;
+
+ for (; seq <= max_seq; seq++) {
+ int gen, type, zone;
+ unsigned int msecs;
+
+ gen = lru_gen_from_seq(seq);
+ msecs = jiffies_to_msecs(jiffies - READ_ONCE(lrugen->timestamps[gen]));
+
+ seq_printf(m, " %10lu %10u", seq, msecs);
+
+ for (type = 0; type < ANON_AND_FILE; type++) {
+ long size = 0;
+
+ if (seq < min_seq[type]) {
+ seq_puts(m, " -0 ");
+ continue;
+ }
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++)
+ size += READ_ONCE(lrugen->sizes[gen][type][zone]);
+
+ seq_printf(m, " %10lu ", max(size, 0L));
+ }
+
+ seq_putc(m, '\n');
+
+ if (full)
+ lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
+ }
+
+ return 0;
+}
+
+static const struct seq_operations lru_gen_seq_ops = {
+ .start = lru_gen_seq_start,
+ .stop = lru_gen_seq_stop,
+ .next = lru_gen_seq_next,
+ .show = lru_gen_seq_show,
+};
+
+static int run_aging(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ unsigned long seq, bool use_filter)
+{
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq == max_seq)
+ try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, use_filter);
+
+ return seq > max_seq ? -EINVAL : 0;
+}
+
+static int run_eviction(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
+ unsigned long seq, unsigned long nr_to_reclaim)
+{
+ struct blk_plug plug;
+ int err = -EINTR;
+ DEFINE_MAX_SEQ(lruvec);
+
+ if (seq >= max_seq - 1)
+ return -EINVAL;
+
+ sc->nr_reclaimed = 0;
+
+ blk_start_plug(&plug);
+
+ while (!signal_pending(current)) {
+ DEFINE_MIN_SEQ(lruvec);
+
+ if (seq < min_seq[!swappiness] || sc->nr_reclaimed >= nr_to_reclaim ||
+ !evict_pages(lruvec, sc, swappiness)) {
+ err = 0;
+ break;
+ }
+
+ cond_resched();
+ }
+
+ blk_finish_plug(&plug);
+
+ return err;
+}
+
+static int run_cmd(char cmd, int memcg_id, int nid, struct scan_control *sc,
+ int swappiness, unsigned long seq, unsigned long opt)
+{
+ struct lruvec *lruvec;
+ int err = -EINVAL;
+ struct mem_cgroup *memcg = NULL;
+
+ if (!mem_cgroup_disabled()) {
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(memcg_id);
+#ifdef CONFIG_MEMCG
+ if (memcg && !css_tryget(&memcg->css))
+ memcg = NULL;
+#endif
+ rcu_read_unlock();
+
+ if (!memcg)
+ goto done;
+ }
+ if (memcg_id != mem_cgroup_id(memcg))
+ goto done;
+
+ if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
+ goto done;
+
+ lruvec = get_lruvec(nid, memcg);
+
+ if (swappiness < 0)
+ swappiness = get_swappiness(memcg);
+ else if (swappiness > 200)
+ goto done;
+
+ switch (cmd) {
+ case '+':
+ err = run_aging(lruvec, sc, swappiness, seq, opt);
+ break;
+ case '-':
+ err = run_eviction(lruvec, sc, swappiness, seq, opt);
+ break;
+ }
+done:
+ mem_cgroup_put(memcg);
+
+ return err;
+}
+
+static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
+ size_t len, loff_t *pos)
+{
+ void *buf;
+ char *cur, *next;
+ unsigned int flags;
+ int err = 0;
+ struct scan_control sc = {
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ .reclaim_idx = MAX_NR_ZONES - 1,
+ .gfp_mask = GFP_KERNEL,
+ };
+
+ buf = kvmalloc(len + 1, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (copy_from_user(buf, src, len)) {
+ kvfree(buf);
+ return -EFAULT;
+ }
+
+ next = buf;
+ next[len] = '\0';
+
+ sc.reclaim_state.mm_walk_args = alloc_mm_walk_args();
+ if (!sc.reclaim_state.mm_walk_args) {
+ kvfree(buf);
+ return -ENOMEM;
+ }
+
+ flags = memalloc_noreclaim_save();
+ set_task_reclaim_state(current, &sc.reclaim_state);
+
+ while ((cur = strsep(&next, ",;\n"))) {
+ int n;
+ int end;
+ char cmd;
+ unsigned int memcg_id;
+ unsigned int nid;
+ unsigned long seq;
+ unsigned int swappiness = -1;
+ unsigned long opt = -1;
+
+ cur = skip_spaces(cur);
+ if (!*cur)
+ continue;
+
+ n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
+ &seq, &end, &swappiness, &end, &opt, &end);
+ if (n < 4 || cur[end]) {
+ err = -EINVAL;
+ break;
+ }
+
+ err = run_cmd(cmd, memcg_id, nid, &sc, swappiness, seq, opt);
+ if (err)
+ break;
+ }
+
+ set_task_reclaim_state(current, NULL);
+ memalloc_noreclaim_restore(flags);
+
+ free_mm_walk_args(sc.reclaim_state.mm_walk_args);
+ kvfree(buf);
+
+ return err ? : len;
+}
+
+static int lru_gen_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &lru_gen_seq_ops);
+}
+
+static const struct file_operations lru_gen_rw_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .write = lru_gen_seq_write,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static const struct file_operations lru_gen_ro_fops = {
+ .open = lru_gen_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/******************************************************************************
* initialization
******************************************************************************/

@@ -4951,6 +5360,12 @@ static int __init init_lru_gen(void)
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);

+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
+ debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
+ debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
+
return 0;
};
late_initcall(init_lru_gen);
@@ -0,0 +1,80 @@
From 3008095eb835d207dd7e5b60899aad17f32aa9f7 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 25 Jan 2021 21:47:24 -0700
Subject: [PATCH 09/10] mm: multigenerational lru: Kconfig

Add configuration options for the multigenerational lru.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: Ic74ea07f8fb5f56e6904a1b80c3c286bc2911635
---
mm/Kconfig | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 59 insertions(+)

--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -899,4 +899,63 @@ config SECRETMEM

source "mm/damon/Kconfig"

+# the multigenerational lru {
+config LRU_GEN
+ bool "Multigenerational LRU"
+ depends on MMU
+ # the following options may leave not enough spare bits in page->flags
+ depends on !MAXSMP && (64BIT || !SPARSEMEM || SPARSEMEM_VMEMMAP)
+ help
+ A high performance LRU implementation to heavily overcommit workloads
+ that are not IO bound. See Documentation/vm/multigen_lru.rst for
+ details.
+
+ Warning: do not enable this option unless you plan to use it because
+ it introduces a small per-process and per-memcg and per-node memory
+ overhead.
+
+config LRU_GEN_ENABLED
+ bool "Turn on by default"
+ depends on LRU_GEN
+ help
+ The default value of /sys/kernel/mm/lru_gen/enabled is 0. This option
+ changes it to 1.
+
+ Warning: the default value is the fast path. See
+ Documentation/static-keys.txt for details.
+
+config LRU_GEN_STATS
+ bool "Full stats for debugging"
+ depends on LRU_GEN
+ help
+ This option keeps full stats for each generation, which can be read
+ from /sys/kernel/debug/lru_gen_full.
+
+ Warning: do not enable this option unless you plan to use it because
+ it introduces an additional small per-process and per-memcg and
+ per-node memory overhead.
+
+config NR_LRU_GENS
+ int "Max number of generations"
+ depends on LRU_GEN
+ range 4 31
+ default 7
+ help
+ This will use order_base_2(N+1) spare bits from page flags.
+
+ Warning: do not use numbers larger than necessary because each
+ generation introduces a small per-node and per-memcg memory overhead.
+
+config TIERS_PER_GEN
+ int "Number of tiers per generation"
+ depends on LRU_GEN
+ range 2 5
+ default 4
+ help
+ This will use N-2 spare bits from page flags.
+
+ Larger values generally offer better protection to active pages under
+ heavy buffered I/O workloads.
+# }
+
endmenu
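
With the defaults above (NR_LRU_GENS=7, TIERS_PER_GEN=4), the page-flags
cost quoted in the two help texts works out to 3 generation bits plus 2
tier bits; a small sketch of the arithmetic (order_base_2() mirrors the
kernel macro's ceil(log2) behavior):

#include <stdio.h>

/* smallest k such that 2^k >= n, i.e. ceil(log2(n)) */
static unsigned int order_base_2(unsigned int n)
{
	unsigned int k = 0;

	while ((1u << k) < n)
		k++;
	return k;
}

int main(void)
{
	unsigned int gens = 7;	/* CONFIG_NR_LRU_GENS */
	unsigned int tiers = 4;	/* CONFIG_TIERS_PER_GEN */

	/* order_base_2(7 + 1) = 3 bits for the generation number */
	printf("generation bits: %u\n", order_base_2(gens + 1));
	/* 4 - 2 = 2 spare bits for the tier, per the help text above */
	printf("tier bits: %u\n", tiers - 2);
	return 0;
}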
@@ -0,0 +1,161 @@
From f59c618ed70a1e48accc4cad91a200966f2569c9 Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Tue, 2 Feb 2021 01:27:45 -0700
Subject: [PATCH 10/10] mm: multigenerational lru: documentation

Add Documentation/vm/multigen_lru.rst.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I1902178bcbb5adfa0a748c4d284a6456059bdd7e
---
Documentation/vm/index.rst | 1 +
Documentation/vm/multigen_lru.rst | 132 ++++++++++++++++++++++++++++++
2 files changed, 133 insertions(+)
create mode 100644 Documentation/vm/multigen_lru.rst

--- a/Documentation/vm/index.rst
+++ b/Documentation/vm/index.rst
@@ -17,6 +17,7 @@ various features of the Linux memory man

swap_numa
zswap
+ multigen_lru

Kernel developers MM documentation
==================================
--- /dev/null
+++ b/Documentation/vm/multigen_lru.rst
@@ -0,0 +1,132 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=====================
+Multigenerational LRU
+=====================
+
+Quick Start
+===========
+Build Configurations
+--------------------
+:Required: Set ``CONFIG_LRU_GEN=y``.
+
+:Optional: Set ``CONFIG_LRU_GEN_ENABLED=y`` to turn the feature on by
+ default.
+
+Runtime Configurations
+----------------------
+:Required: Write ``1`` to ``/sys/kernel/mm/lru_gen/enabled`` if the
+ feature was not turned on by default.
+
+:Optional: Write ``N`` to ``/sys/kernel/mm/lru_gen/min_ttl_ms`` to
+ protect the working set of ``N`` milliseconds. The OOM killer is
+ invoked if this working set cannot be kept in memory.
+
+:Optional: Read ``/sys/kernel/debug/lru_gen`` to confirm the feature
+ is turned on. This file has the following output:
+
+::
+
+ memcg memcg_id memcg_path
+ node node_id
+ min_gen birth_time anon_size file_size
+ ...
+ max_gen birth_time anon_size file_size
+
+``min_gen`` is the oldest generation number and ``max_gen`` is the
+youngest generation number. ``birth_time`` is in milliseconds.
+``anon_size`` and ``file_size`` are in pages.
+
+Phones/Laptops/Workstations
+---------------------------
+No additional configurations required.
+
+Servers/Data Centers
+--------------------
+:To support more generations: Change ``CONFIG_NR_LRU_GENS`` to a
+ larger number.
+
+:To support more tiers: Change ``CONFIG_TIERS_PER_GEN`` to a larger
+ number.
+
+:To support full stats: Set ``CONFIG_LRU_GEN_STATS=y``.
+
+:Working set estimation: Write ``+ memcg_id node_id max_gen
+ [swappiness] [use_bloom_filter]`` to ``/sys/kernel/debug/lru_gen`` to
+ invoke the aging, which scans PTEs for accessed pages and then
+ creates the next generation ``max_gen+1``. A swap file and a non-zero
+ ``swappiness``, which overrides ``vm.swappiness``, are required to
+ scan PTEs mapping anon pages. Set ``use_bloom_filter`` to 0 to
+ override the default behavior which only scans PTE tables found
+ populated.
+
+:Proactive reclaim: Write ``- memcg_id node_id min_gen [swappiness]
+ [nr_to_reclaim]`` to ``/sys/kernel/debug/lru_gen`` to invoke the
+ eviction, which evicts generations less than or equal to ``min_gen``.
+ ``min_gen`` should be less than ``max_gen-1`` as ``max_gen`` and
+ ``max_gen-1`` are not fully aged and therefore cannot be evicted.
+ Use ``nr_to_reclaim`` to limit the number of pages to evict. Multiple
+ command lines are supported, as is concatenation with delimiters
+ ``,`` and ``;``.
+
+Framework
+=========
+For each ``lruvec``, evictable pages are divided into multiple
+generations. The youngest generation number is stored in
+``lrugen->max_seq`` for both anon and file types as they are aged on
+an equal footing. The oldest generation numbers are stored in
+``lrugen->min_seq[]`` separately for anon and file types as clean
+file pages can be evicted regardless of swap and writeback
+constraints. These three variables are monotonically increasing.
+Generation numbers are truncated into
+``order_base_2(CONFIG_NR_LRU_GENS+1)`` bits in order to fit into
+``page->flags``. The sliding window technique is used to prevent
+truncated generation numbers from overlapping. Each truncated
+generation number is an index to an array of per-type and per-zone
+lists ``lrugen->lists``.
+
+Each generation is divided into multiple tiers. Tiers represent
+different ranges of numbers of accesses from file descriptors only.
+Pages accessed ``N`` times via file descriptors belong to tier
+``order_base_2(N)``. Each generation contains at most
+``CONFIG_TIERS_PER_GEN`` tiers, and they require additional
+``CONFIG_TIERS_PER_GEN-2`` bits in ``page->flags``. In contrast to
+moving between generations which requires list operations, moving
+between tiers only involves operations on ``page->flags`` and
+therefore has a negligible cost. A feedback loop modeled after the PID
+controller monitors refaulted % across all tiers and decides when to
+protect pages from which tiers.
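
To make the tier arithmetic concrete, a userspace sketch of the mapping
from access counts to tiers (not part of the patch; the cap at the last
tier is an assumption based on ``CONFIG_TIERS_PER_GEN`` being the upper
bound):

/* tier of a page accessed n times via file descriptors:
 * n <= 1 -> tier 0, n = 2 -> tier 1, n = 3..4 -> tier 2, n = 5..8 -> tier 3 */
static unsigned int tier_from_accesses(unsigned int n, unsigned int nr_tiers)
{
	unsigned int tier = 0;

	while ((1u << tier) < n)	/* order_base_2(n) */
		tier++;
	/* assumption: counts beyond the last tier saturate there */
	return tier < nr_tiers ? tier : nr_tiers - 1;
}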
+
+The framework comprises two conceptually independent components: the
+aging and the eviction, which can be invoked separately from user
+space for the purpose of working set estimation and proactive reclaim.
+
+Aging
+-----
+The aging produces young generations. Given an ``lruvec``, the aging
+traverses ``lruvec_memcg()->mm_list`` and calls ``walk_page_range()``
+to scan PTEs for accessed pages (a ``mm_struct`` list is maintained
+for each ``memcg``). Upon finding one, the aging updates its
+generation number to ``max_seq`` (modulo ``CONFIG_NR_LRU_GENS``).
+After each round of traversal, the aging increments ``max_seq``. The
+aging is due when ``min_seq[]`` reaches ``max_seq-1``.
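
The modulo arithmetic is the same as in ``lru_gen_from_seq()`` from
patch 05: because the window between ``min_seq[]`` and ``max_seq`` never
spans more than ``CONFIG_NR_LRU_GENS`` generations, truncated numbers
stay unambiguous. A compact sketch:

#include <stdio.h>

#define MAX_NR_GENS 7 /* CONFIG_NR_LRU_GENS */

/* truncated generation number, the index into the per-type, per-zone lists */
static unsigned int gen_from_seq(unsigned long seq)
{
	return seq % MAX_NR_GENS;
}

int main(void)
{
	unsigned long min_seq = 12, max_seq = 15; /* an illustrative window */
	unsigned long seq;

	/* at most MAX_NR_GENS live sequence numbers, so no two share a slot */
	for (seq = min_seq; seq <= max_seq; seq++)
		printf("seq %lu -> slot %u\n", seq, gen_from_seq(seq));
	return 0;
}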
+
+Eviction
+--------
+The eviction consumes old generations. Given an ``lruvec``, the
+eviction scans pages on the per-zone lists indexed by anon and file
+``min_seq[]`` (modulo ``CONFIG_NR_LRU_GENS``). It first tries to
+select a type based on the values of ``min_seq[]``. If they are
+equal, it selects the type that has a lower refaulted %. The eviction
+sorts a page according to its updated generation number if the aging
+has found this page accessed. It also moves a page to the next
+generation if this page is from an upper tier that has a higher
+refaulted % than the base tier. The eviction increments ``min_seq[]``
+of a selected type when it finds all the per-zone lists indexed by
+``min_seq[]`` of this selected type are empty.
+
+To-do List
+==========
+KVM Optimization
+----------------
+Support shadow page table walk.