2023-03-20 21:51:03 +00:00
|
|
|
From 05223c4e80b34e29f2255c04ffebc2c4475e7593 Mon Sep 17 00:00:00 2001
|
|
|
|
From: Yu Zhao <yuzhao@google.com>
|
|
|
|
Date: Sun, 18 Sep 2022 02:00:05 -0600
|
|
|
|
Subject: [PATCH 08/29] mm: multi-gen LRU: support page table walks
|
|
|
|
MIME-Version: 1.0
|
|
|
|
Content-Type: text/plain; charset=UTF-8
|
|
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
|
|
|
|
To further exploit spatial locality, the aging prefers to walk page tables
|
|
|
|
to search for young PTEs and promote hot pages. A kill switch will be
|
|
|
|
added in the next patch to disable this behavior. When disabled, the
|
|
|
|
aging relies on the rmap only.
|
|
|
|
|
|
|
|
NB: this behavior has nothing in common with the page table scanning in the
|
|
|
|
2.4 kernel [1], which searches page tables for old PTEs, adds cold pages
|
|
|
|
to swapcache and unmaps them.
|
|
|
|
|
|
|
|
To avoid confusion, the term "iteration" specifically means the traversal
|
|
|
|
of an entire mm_struct list; the term "walk" will be applied to page
|
|
|
|
tables and the rmap, as usual.
|
|
|
|
|
|
|
|
An mm_struct list is maintained for each memcg, and an mm_struct follows
|
|
|
|
its owner task to the new memcg when this task is migrated. Given an
|
|
|
|
lruvec, the aging iterates lruvec_memcg()->mm_list and calls
|
|
|
|
walk_page_range() with each mm_struct on this list to promote hot pages
|
|
|
|
before it increments max_seq.
|
|
|
|
|
|
|
|
When multiple page table walkers iterate the same list, each of them gets
|
|
|
|
a unique mm_struct; therefore they can run concurrently. Page table
|
|
|
|
walkers ignore any misplaced pages, e.g., if an mm_struct was migrated,
|
|
|
|
pages it left in the previous memcg will not be promoted when its current
|
|
|
|
memcg is under reclaim. Similarly, page table walkers will not promote
|
|
|
|
pages from nodes other than the one under reclaim.
|
|
|
|
|
|
|
|
This patch uses the following optimizations when walking page tables:
|
|
|
|
1. It tracks the usage of mm_struct's between context switches so that
|
|
|
|
page table walkers can skip processes that have been sleeping since
|
|
|
|
the last iteration.
|
|
|
|
2. It uses generational Bloom filters to record populated branches so
|
|
|
|
that page table walkers can reduce their search space based on the
|
|
|
|
query results, e.g., to skip page tables containing mostly holes or
|
|
|
|
misplaced pages.
|
|
|
|
3. It takes advantage of the accessed bit in non-leaf PMD entries when
|
|
|
|
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y.
|
|
|
|
4. It does not zigzag between a PGD table and the same PMD table
|
|
|
|
spanning multiple VMAs. IOW, it finishes all the VMAs within the
|
|
|
|
range of the same PMD table before it returns to a PGD table. This
|
|
|
|
improves the cache performance for workloads that have large
|
|
|
|
numbers of tiny VMAs [2], especially when CONFIG_PGTABLE_LEVELS=5.
|
|
|
|
|
|
|
|
Server benchmark results:
|
|
|
|
Single workload:
|
|
|
|
fio (buffered I/O): no change
|
|
|
|
|
|
|
|
Single workload:
|
|
|
|
memcached (anon): +[8, 10]%
|
|
|
|
Ops/sec KB/sec
|
|
|
|
patch1-7: 1147696.57 44640.29
|
|
|
|
patch1-8: 1245274.91 48435.66
|
|
|
|
|
|
|
|
Configurations:
|
|
|
|
no change
|
|
|
|
|
|
|
|
Client benchmark results:
|
|
|
|
kswapd profiles:
|
|
|
|
patch1-7
|
|
|
|
48.16% lzo1x_1_do_compress (real work)
|
|
|
|
8.20% page_vma_mapped_walk (overhead)
|
|
|
|
7.06% _raw_spin_unlock_irq
|
|
|
|
2.92% ptep_clear_flush
|
|
|
|
2.53% __zram_bvec_write
|
|
|
|
2.11% do_raw_spin_lock
|
|
|
|
2.02% memmove
|
|
|
|
1.93% lru_gen_look_around
|
|
|
|
1.56% free_unref_page_list
|
|
|
|
1.40% memset
|
|
|
|
|
|
|
|
patch1-8
|
|
|
|
49.44% lzo1x_1_do_compress (real work)
|
|
|
|
6.19% page_vma_mapped_walk (overhead)
|
|
|
|
5.97% _raw_spin_unlock_irq
|
|
|
|
3.13% get_pfn_page
|
|
|
|
2.85% ptep_clear_flush
|
|
|
|
2.42% __zram_bvec_write
|
|
|
|
2.08% do_raw_spin_lock
|
|
|
|
1.92% memmove
|
|
|
|
1.44% alloc_zspage
|
|
|
|
1.36% memset
|
|
|
|
|
|
|
|
Configurations:
|
|
|
|
no change
|
|
|
|
|
|
|
|
Thanks to the following developers for their efforts [3].
|
|
|
|
kernel test robot <lkp@intel.com>
|
|
|
|
|
|
|
|
[1] https://lwn.net/Articles/23732/
|
|
|
|
[2] https://llvm.org/docs/ScudoHardenedAllocator.html
|
|
|
|
[3] https://lore.kernel.org/r/202204160827.ekEARWQo-lkp@intel.com/
|
|
|
|
|
|
|
|
Link: https://lkml.kernel.org/r/20220918080010.2920238-9-yuzhao@google.com
|
|
|
|
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
|
|
|
Acked-by: Brian Geffon <bgeffon@google.com>
|
|
|
|
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
|
|
|
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
|
|
|
Acked-by: Steven Barrett <steven@liquorix.net>
|
|
|
|
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
|
|
|
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
|
|
|
Tested-by: Donald Carr <d@chaos-reins.com>
|
|
|
|
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
|
|
|
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
|
|
|
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
|
|
|
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
|
|
|
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
|
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
|
|
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
|
|
|
Cc: Barry Song <baohua@kernel.org>
|
|
|
|
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
|
|
|
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
|
|
|
Cc: Hillf Danton <hdanton@sina.com>
|
|
|
|
Cc: Jens Axboe <axboe@kernel.dk>
|
|
|
|
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
|
|
|
Cc: Jonathan Corbet <corbet@lwn.net>
|
|
|
|
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
|
|
|
Cc: Matthew Wilcox <willy@infradead.org>
|
|
|
|
Cc: Mel Gorman <mgorman@suse.de>
|
|
|
|
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
|
|
|
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
|
|
|
Cc: Michal Hocko <mhocko@kernel.org>
|
|
|
|
Cc: Mike Rapoport <rppt@kernel.org>
|
|
|
|
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
|
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
|
|
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
|
|
|
Cc: Tejun Heo <tj@kernel.org>
|
|
|
|
Cc: Vlastimil Babka <vbabka@suse.cz>
|
|
|
|
Cc: Will Deacon <will@kernel.org>
|
|
|
|
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
|
|
|
---
|
|
|
|
fs/exec.c | 2 +
|
|
|
|
include/linux/memcontrol.h | 5 +
|
|
|
|
include/linux/mm_types.h | 76 +++
|
|
|
|
include/linux/mmzone.h | 56 +-
|
|
|
|
include/linux/swap.h | 4 +
|
|
|
|
kernel/exit.c | 1 +
|
|
|
|
kernel/fork.c | 9 +
|
|
|
|
kernel/sched/core.c | 1 +
|
|
|
|
mm/memcontrol.c | 25 +
|
|
|
|
mm/vmscan.c | 1010 +++++++++++++++++++++++++++++++++++-
|
|
|
|
10 files changed, 1172 insertions(+), 17 deletions(-)
|
|
|
|
|
|
|
|
--- a/fs/exec.c
|
|
|
|
+++ b/fs/exec.c
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
|
2023-03-20 21:51:03 +00:00
|
|
|
active_mm = tsk->active_mm;
|
|
|
|
tsk->active_mm = mm;
|
|
|
|
tsk->mm = mm;
|
|
|
|
+ lru_gen_add_mm(mm);
|
|
|
|
/*
|
|
|
|
* This prevents preemption while active_mm is being loaded and
|
|
|
|
* it and mm are being updated, which could cause problems for
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -1028,6 +1029,7 @@ static int exec_mmap(struct mm_struct *m
|
2023-03-20 21:51:03 +00:00
|
|
|
tsk->mm->vmacache_seqnum = 0;
|
|
|
|
vmacache_flush(tsk);
|
|
|
|
task_unlock(tsk);
|
|
|
|
+ lru_gen_use_mm(mm);
|
|
|
|
if (old_mm) {
|
|
|
|
mmap_read_unlock(old_mm);
|
|
|
|
BUG_ON(active_mm != old_mm);
|
|
|
|
--- a/include/linux/memcontrol.h
|
|
|
|
+++ b/include/linux/memcontrol.h
|
|
|
|
@@ -348,6 +348,11 @@ struct mem_cgroup {
|
|
|
|
struct deferred_split deferred_split_queue;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
|
|
+ /* per-memcg mm_struct list */
|
|
|
|
+ struct lru_gen_mm_list mm_list;
|
|
|
|
+#endif
|
|
|
|
+
|
|
|
|
struct mem_cgroup_per_node *nodeinfo[];
|
|
|
|
};
|
|
|
|
|
|
|
|
--- a/include/linux/mm_types.h
|
|
|
|
+++ b/include/linux/mm_types.h
|
|
|
|
@@ -580,6 +580,22 @@ struct mm_struct {
|
|
|
|
#ifdef CONFIG_IOMMU_SUPPORT
|
|
|
|
u32 pasid;
|
|
|
|
#endif
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
|
|
+ struct {
|
|
|
|
+ /* this mm_struct is on lru_gen_mm_list */
|
|
|
|
+ struct list_head list;
|
|
|
|
+ /*
|
|
|
|
+ * Set when switching to this mm_struct, as a hint of
|
|
|
|
+ * whether it has been used since the last time per-node
|
|
|
|
+ * page table walkers cleared the corresponding bits.
|
|
|
|
+ */
|
|
|
|
+ unsigned long bitmap;
|
|
|
|
+#ifdef CONFIG_MEMCG
|
|
|
|
+ /* points to the memcg of "owner" above */
|
|
|
|
+ struct mem_cgroup *memcg;
|
|
|
|
+#endif
|
|
|
|
+ } lru_gen;
|
|
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
|
|
} __randomize_layout;
|
|
|
|
|
|
|
|
/*
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -606,6 +622,66 @@ static inline cpumask_t *mm_cpumask(stru
|
2023-03-20 21:51:03 +00:00
|
|
|
return (struct cpumask *)&mm->cpu_bitmap;
|
|
|
|
}
|
|
|
|
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
|
|
+
|
|
|
|
+struct lru_gen_mm_list {
|
|
|
|
+ /* mm_struct list for page table walkers */
|
|
|
|
+ struct list_head fifo;
|
|
|
|
+ /* protects the list above */
|
|
|
|
+ spinlock_t lock;
|
|
|
|
+};
|
|
|
|
+
|
|
|
|
+void lru_gen_add_mm(struct mm_struct *mm);
|
|
|
|
+void lru_gen_del_mm(struct mm_struct *mm);
|
|
|
|
+#ifdef CONFIG_MEMCG
|
|
|
|
+void lru_gen_migrate_mm(struct mm_struct *mm);
|
|
|
|
+#endif
|
|
|
|
+
|
|
|
|
+static inline void lru_gen_init_mm(struct mm_struct *mm)
|
|
|
|
+{
|
|
|
|
+ INIT_LIST_HEAD(&mm->lru_gen.list);
|
|
|
|
+ mm->lru_gen.bitmap = 0;
|
|
|
|
+#ifdef CONFIG_MEMCG
|
|
|
|
+ mm->lru_gen.memcg = NULL;
|
|
|
|
+#endif
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static inline void lru_gen_use_mm(struct mm_struct *mm)
|
|
|
|
+{
|
|
|
|
+ /*
|
|
|
|
+ * When the bitmap is set, page reclaim knows this mm_struct has been
|
|
|
|
+ * used since the last time it cleared the bitmap. So it might be worth
|
|
|
|
+ * walking the page tables of this mm_struct to clear the accessed bit.
|
|
|
|
+ */
|
|
|
|
+ WRITE_ONCE(mm->lru_gen.bitmap, -1);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+#else /* !CONFIG_LRU_GEN */
|
|
|
|
+
|
|
|
|
+static inline void lru_gen_add_mm(struct mm_struct *mm)
|
|
|
|
+{
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static inline void lru_gen_del_mm(struct mm_struct *mm)
|
|
|
|
+{
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+#ifdef CONFIG_MEMCG
|
|
|
|
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
|
|
|
|
+{
|
|
|
|
+}
|
|
|
|
+#endif
|
|
|
|
+
|
|
|
|
+static inline void lru_gen_init_mm(struct mm_struct *mm)
|
|
|
|
+{
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static inline void lru_gen_use_mm(struct mm_struct *mm)
|
|
|
|
+{
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
|
|
+
|
|
|
|
struct mmu_gather;
|
|
|
|
extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
|
|
|
|
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
|
|
|
|
--- a/include/linux/mmzone.h
|
|
|
|
+++ b/include/linux/mmzone.h
|
|
|
|
@@ -385,7 +385,7 @@ enum {
|
|
|
|
* min_seq behind.
|
|
|
|
*
|
|
|
|
* The number of pages in each generation is eventually consistent and therefore
|
|
|
|
- * can be transiently negative.
|
|
|
|
+ * can be transiently negative when reset_batch_size() is pending.
|
|
|
|
*/
|
|
|
|
struct lru_gen_struct {
|
|
|
|
/* the aging increments the youngest generation number */
|
|
|
|
@@ -407,6 +407,53 @@ struct lru_gen_struct {
|
|
|
|
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
|
|
|
|
};
|
|
|
|
|
|
|
|
+enum {
|
|
|
|
+ MM_LEAF_TOTAL, /* total leaf entries */
|
|
|
|
+ MM_LEAF_OLD, /* old leaf entries */
|
|
|
|
+ MM_LEAF_YOUNG, /* young leaf entries */
|
|
|
|
+ MM_NONLEAF_TOTAL, /* total non-leaf entries */
|
|
|
|
+ MM_NONLEAF_FOUND, /* non-leaf entries found in Bloom filters */
|
|
|
|
+ MM_NONLEAF_ADDED, /* non-leaf entries added to Bloom filters */
|
|
|
|
+ NR_MM_STATS
|
|
|
|
+};
|
|
|
|
+
|
|
|
|
+/* double-buffering Bloom filters */
|
|
|
|
+#define NR_BLOOM_FILTERS 2
|
|
|
|
+
|
|
|
|
+struct lru_gen_mm_state {
|
|
|
|
+ /* set to max_seq after each iteration */
|
|
|
|
+ unsigned long seq;
|
|
|
|
+ /* where the current iteration continues (inclusive) */
|
|
|
|
+ struct list_head *head;
|
|
|
|
+ /* where the last iteration ended (exclusive) */
|
|
|
|
+ struct list_head *tail;
|
|
|
|
+ /* to wait for the last page table walker to finish */
|
|
|
|
+ struct wait_queue_head wait;
|
|
|
|
+ /* Bloom filters flip after each iteration */
|
|
|
|
+ unsigned long *filters[NR_BLOOM_FILTERS];
|
|
|
|
+ /* the mm stats for debugging */
|
|
|
|
+ unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
|
|
|
|
+ /* the number of concurrent page table walkers */
|
|
|
|
+ int nr_walkers;
|
|
|
|
+};
|
|
|
|
+
|
|
|
|
+struct lru_gen_mm_walk {
|
|
|
|
+ /* the lruvec under reclaim */
|
|
|
|
+ struct lruvec *lruvec;
|
|
|
|
+ /* unstable max_seq from lru_gen_struct */
|
|
|
|
+ unsigned long max_seq;
|
|
|
|
+ /* the next address within an mm to scan */
|
|
|
|
+ unsigned long next_addr;
|
|
|
|
+ /* to batch promoted pages */
|
|
|
|
+ int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
|
|
|
|
+ /* to batch the mm stats */
|
|
|
|
+ int mm_stats[NR_MM_STATS];
|
|
|
|
+ /* total batched items */
|
|
|
|
+ int batched;
|
|
|
|
+ bool can_swap;
|
|
|
|
+ bool force_scan;
|
|
|
|
+};
|
|
|
|
+
|
|
|
|
void lru_gen_init_lruvec(struct lruvec *lruvec);
|
|
|
|
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
|
|
|
|
|
|
|
|
@@ -457,6 +504,8 @@ struct lruvec {
|
|
|
|
#ifdef CONFIG_LRU_GEN
|
|
|
|
/* evictable pages divided into generations */
|
|
|
|
struct lru_gen_struct lrugen;
|
|
|
|
+ /* to concurrently iterate lru_gen_mm_list */
|
|
|
|
+ struct lru_gen_mm_state mm_state;
|
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_MEMCG
|
|
|
|
struct pglist_data *pgdat;
|
|
|
|
@@ -1042,6 +1091,11 @@ typedef struct pglist_data {
|
|
|
|
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
|
|
+ /* kswapd mm walk data */
|
|
|
|
+ struct lru_gen_mm_walk mm_walk;
|
|
|
|
+#endif
|
|
|
|
+
|
|
|
|
ZONE_PADDING(_pad2_)
|
|
|
|
|
|
|
|
/* Per-node vmstats */
|
|
|
|
--- a/include/linux/swap.h
|
|
|
|
+++ b/include/linux/swap.h
|
|
|
|
@@ -137,6 +137,10 @@ union swap_header {
|
|
|
|
*/
|
|
|
|
struct reclaim_state {
|
|
|
|
unsigned long reclaimed_slab;
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
|
|
+ /* per-thread mm walk data */
|
|
|
|
+ struct lru_gen_mm_walk *mm_walk;
|
|
|
|
+#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
#ifdef __KERNEL__
|
|
|
|
--- a/kernel/exit.c
|
|
|
|
+++ b/kernel/exit.c
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -469,6 +469,7 @@ assign_new_owner:
|
2023-03-20 21:51:03 +00:00
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
WRITE_ONCE(mm->owner, c);
|
|
|
|
+ lru_gen_migrate_mm(mm);
|
|
|
|
task_unlock(c);
|
|
|
|
put_task_struct(c);
|
|
|
|
}
|
|
|
|
--- a/kernel/fork.c
|
|
|
|
+++ b/kernel/fork.c
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -1083,6 +1083,7 @@ static struct mm_struct *mm_init(struct
|
2023-03-20 21:51:03 +00:00
|
|
|
goto fail_nocontext;
|
|
|
|
|
|
|
|
mm->user_ns = get_user_ns(user_ns);
|
|
|
|
+ lru_gen_init_mm(mm);
|
|
|
|
return mm;
|
|
|
|
|
|
|
|
fail_nocontext:
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -1125,6 +1126,7 @@ static inline void __mmput(struct mm_str
|
2023-03-20 21:51:03 +00:00
|
|
|
}
|
|
|
|
if (mm->binfmt)
|
|
|
|
module_put(mm->binfmt->module);
|
|
|
|
+ lru_gen_del_mm(mm);
|
|
|
|
mmdrop(mm);
|
|
|
|
}
|
|
|
|
|
2023-08-08 23:57:20 +00:00
|
|
|
@@ -2617,6 +2619,13 @@ pid_t kernel_clone(struct kernel_clone_a
|
2023-03-20 21:51:03 +00:00
|
|
|
get_task_struct(p);
|
|
|
|
}
|
|
|
|
|
|
|
|
+ if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
|
|
|
|
+ /* lock the task to synchronize with memcg migration */
|
|
|
|
+ task_lock(p);
|
|
|
|
+ lru_gen_add_mm(p->mm);
|
|
|
|
+ task_unlock(p);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
wake_up_new_task(p);
|
|
|
|
|
|
|
|
/* forking complete and child started to run, tell ptracer */
|
|
|
|
--- a/kernel/sched/core.c
|
|
|
|
+++ b/kernel/sched/core.c
|
2023-03-30 15:39:17 +00:00
|
|
|
@@ -5010,6 +5010,7 @@ context_switch(struct rq *rq, struct tas
|
2023-03-20 21:51:03 +00:00
|
|
|
* finish_task_switch()'s mmdrop().
|
|
|
|
*/
|
|
|
|
switch_mm_irqs_off(prev->active_mm, next->mm, next);
|
|
|
|
+ lru_gen_use_mm(next->mm);
|
|
|
|
|
|
|
|
if (!prev->mm) { // from kernel
|
|
|
|
/* will mmdrop() in finish_task_switch(). */
|
|
|
|
--- a/mm/memcontrol.c
|
|
|
|
+++ b/mm/memcontrol.c
|
|
|
|
@@ -6212,6 +6212,30 @@ static void mem_cgroup_move_task(void)
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
+#ifdef CONFIG_LRU_GEN
|
|
|
|
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
|
|
|
|
+{
|
|
|
|
+ struct task_struct *task;
|
|
|
|
+ struct cgroup_subsys_state *css;
|
|
|
|
+
|
|
|
|
+ /* find the first leader if there is any */
|
|
|
|
+ cgroup_taskset_for_each_leader(task, css, tset)
|
|
|
|
+ break;
|
|
|
|
+
|
|
|
|
+ if (!task)
|
|
|
|
+ return;
|
|
|
|
+
|
|
|
|
+ task_lock(task);
|
|
|
|
+ if (task->mm && READ_ONCE(task->mm->owner) == task)
|
|
|
|
+ lru_gen_migrate_mm(task->mm);
|
|
|
|
+ task_unlock(task);
|
|
|
|
+}
|
|
|
|
+#else
|
|
|
|
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
|
|
|
|
+{
|
|
|
|
+}
|
|
|
|
+#endif /* CONFIG_LRU_GEN */
|
|
|
|
+
|
|
|
|
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
|
|
|
|
{
|
|
|
|
if (value == PAGE_COUNTER_MAX)
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -6555,6 +6579,7 @@ struct cgroup_subsys memory_cgrp_subsys
|
2023-03-20 21:51:03 +00:00
|
|
|
.css_reset = mem_cgroup_css_reset,
|
|
|
|
.css_rstat_flush = mem_cgroup_css_rstat_flush,
|
|
|
|
.can_attach = mem_cgroup_can_attach,
|
|
|
|
+ .attach = mem_cgroup_attach,
|
|
|
|
.cancel_attach = mem_cgroup_cancel_attach,
|
|
|
|
.post_attach = mem_cgroup_move_task,
|
|
|
|
.dfl_cftypes = memory_files,
|
|
|
|
--- a/mm/vmscan.c
|
|
|
|
+++ b/mm/vmscan.c
|
|
|
|
@@ -50,6 +50,8 @@
|
|
|
|
#include <linux/printk.h>
|
|
|
|
#include <linux/dax.h>
|
|
|
|
#include <linux/psi.h>
|
|
|
|
+#include <linux/pagewalk.h>
|
|
|
|
+#include <linux/shmem_fs.h>
|
|
|
|
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
#include <asm/div64.h>
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -2853,7 +2855,7 @@ static bool can_age_anon_pages(struct pg
|
2023-03-20 21:51:03 +00:00
|
|
|
for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
|
|
|
|
for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
|
|
|
|
|
|
|
|
-static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int nid)
|
|
|
|
+static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
|
|
|
|
{
|
|
|
|
struct pglist_data *pgdat = NODE_DATA(nid);
|
|
|
|
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -2899,6 +2901,371 @@ static bool __maybe_unused seq_is_valid(
|
2023-03-20 21:51:03 +00:00
|
|
|
}
|
|
|
|
|
2023-03-25 16:24:27 +00:00
|
|
|
/******************************************************************************
|
2023-03-20 21:51:03 +00:00
|
|
|
+ * mm_struct list
|
|
|
|
+ ******************************************************************************/
|
|
|
|
+
|
|
|
|
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
|
|
|
|
+{
|
|
|
|
+ static struct lru_gen_mm_list mm_list = {
|
|
|
|
+ .fifo = LIST_HEAD_INIT(mm_list.fifo),
|
|
|
|
+ .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
|
|
|
|
+ };
|
|
|
|
+
|
|
|
|
+#ifdef CONFIG_MEMCG
|
|
|
|
+ if (memcg)
|
|
|
|
+ return &memcg->mm_list;
|
|
|
|
+#endif
|
|
|
|
+ VM_WARN_ON_ONCE(!mem_cgroup_disabled());
|
|
|
|
+
|
|
|
|
+ return &mm_list;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+void lru_gen_add_mm(struct mm_struct *mm)
|
|
|
|
+{
|
|
|
|
+ int nid;
|
|
|
|
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
|
|
|
|
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
|
|
|
|
+#ifdef CONFIG_MEMCG
|
|
|
|
+ VM_WARN_ON_ONCE(mm->lru_gen.memcg);
|
|
|
|
+ mm->lru_gen.memcg = memcg;
|
|
|
|
+#endif
|
|
|
|
+ spin_lock(&mm_list->lock);
|
|
|
|
+
|
|
|
|
+ for_each_node_state(nid, N_MEMORY) {
|
|
|
|
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
|
|
|
+
|
|
|
|
+ if (!lruvec)
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ /* the first addition since the last iteration */
|
|
|
|
+ if (lruvec->mm_state.tail == &mm_list->fifo)
|
|
|
|
+ lruvec->mm_state.tail = &mm->lru_gen.list;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
|
|
|
|
+
|
|
|
|
+ spin_unlock(&mm_list->lock);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+void lru_gen_del_mm(struct mm_struct *mm)
|
|
|
|
+{
|
|
|
|
+ int nid;
|
|
|
|
+ struct lru_gen_mm_list *mm_list;
|
|
|
|
+ struct mem_cgroup *memcg = NULL;
|
|
|
|
+
|
|
|
|
+ if (list_empty(&mm->lru_gen.list))
|
|
|
|
+ return;
|
|
|
|
+
|
|
|
|
+#ifdef CONFIG_MEMCG
|
|
|
|
+ memcg = mm->lru_gen.memcg;
|
|
|
|
+#endif
|
|
|
|
+ mm_list = get_mm_list(memcg);
|
|
|
|
+
|
|
|
|
+ spin_lock(&mm_list->lock);
|
|
|
|
+
|
|
|
|
+ for_each_node(nid) {
|
|
|
|
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
|
|
|
|
+
|
|
|
|
+ if (!lruvec)
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ /* where the last iteration ended (exclusive) */
|
|
|
|
+ if (lruvec->mm_state.tail == &mm->lru_gen.list)
|
|
|
|
+ lruvec->mm_state.tail = lruvec->mm_state.tail->next;
|
|
|
|
+
|
|
|
|
+ /* where the current iteration continues (inclusive) */
|
|
|
|
+ if (lruvec->mm_state.head != &mm->lru_gen.list)
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ lruvec->mm_state.head = lruvec->mm_state.head->next;
|
|
|
|
+ /* the deletion ends the current iteration */
|
|
|
|
+ if (lruvec->mm_state.head == &mm_list->fifo)
|
|
|
|
+ WRITE_ONCE(lruvec->mm_state.seq, lruvec->mm_state.seq + 1);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ list_del_init(&mm->lru_gen.list);
|
|
|
|
+
|
|
|
|
+ spin_unlock(&mm_list->lock);
|
|
|
|
+
|
|
|
|
+#ifdef CONFIG_MEMCG
|
|
|
|
+ mem_cgroup_put(mm->lru_gen.memcg);
|
|
|
|
+ mm->lru_gen.memcg = NULL;
|
|
|
|
+#endif
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+#ifdef CONFIG_MEMCG
|
|
|
|
+void lru_gen_migrate_mm(struct mm_struct *mm)
|
|
|
|
+{
|
|
|
|
+ struct mem_cgroup *memcg;
|
|
|
|
+ struct task_struct *task = rcu_dereference_protected(mm->owner, true);
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(task->mm != mm);
|
|
|
|
+ lockdep_assert_held(&task->alloc_lock);
|
|
|
|
+
|
|
|
|
+ /* for mm_update_next_owner() */
|
|
|
|
+ if (mem_cgroup_disabled())
|
|
|
|
+ return;
|
|
|
|
+
|
|
|
|
+ rcu_read_lock();
|
|
|
|
+ memcg = mem_cgroup_from_task(task);
|
|
|
|
+ rcu_read_unlock();
|
|
|
|
+ if (memcg == mm->lru_gen.memcg)
|
|
|
|
+ return;
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
|
|
|
|
+ VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
|
|
|
|
+
|
|
|
|
+ lru_gen_del_mm(mm);
|
|
|
|
+ lru_gen_add_mm(mm);
|
|
|
|
+}
|
|
|
|
+#endif
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+ * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
|
|
|
|
+ * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
|
|
|
|
+ * bits in a bitmap, k is the number of hash functions and n is the number of
|
|
|
|
+ * inserted items.
|
|
|
|
+ *
|
|
|
|
+ * Page table walkers use one of the two filters to reduce their search space.
|
|
|
|
+ * To get rid of non-leaf entries that no longer have enough leaf entries, the
|
|
|
|
+ * aging uses the double-buffering technique to flip to the other filter each
|
|
|
|
+ * time it produces a new generation. For non-leaf entries that have enough
|
|
|
|
+ * leaf entries, the aging carries them over to the next generation in
|
|
|
|
+ * walk_pmd_range(); the eviction also reports them when walking the rmap
|
|
|
|
+ * in lru_gen_look_around().
|
|
|
|
+ *
|
|
|
|
+ * For future optimizations:
|
|
|
|
+ * 1. It's not necessary to keep both filters all the time. The spare one can be
|
|
|
|
+ * freed after the RCU grace period and reallocated if needed again.
|
|
|
|
+ * 2. And when reallocating, it's worth scaling its size according to the number
|
|
|
|
+ * of inserted entries in the other filter, to reduce the memory overhead on
|
|
|
|
+ * small systems and false positives on large systems.
|
|
|
|
+ * 3. Jenkins' hash function is an alternative to Knuth's.
|
|
|
|
+ */
|
|
|
|
+#define BLOOM_FILTER_SHIFT 15
|
|
|
|
+
|
|
|
|
+static inline int filter_gen_from_seq(unsigned long seq)
|
|
|
|
+{
|
|
|
|
+ return seq % NR_BLOOM_FILTERS;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static void get_item_key(void *item, int *key)
|
|
|
|
+{
|
|
|
|
+ u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
|
|
|
|
+
|
|
|
|
+ BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
|
|
|
|
+
|
|
|
|
+ key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
|
|
|
|
+ key[1] = hash >> BLOOM_FILTER_SHIFT;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
|
|
|
|
+{
|
|
|
|
+ unsigned long *filter;
|
|
|
|
+ int gen = filter_gen_from_seq(seq);
|
|
|
|
+
|
|
|
|
+ filter = lruvec->mm_state.filters[gen];
|
|
|
|
+ if (filter) {
|
|
|
|
+ bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
|
|
|
|
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
|
|
|
|
+ WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
|
|
|
|
+{
|
|
|
|
+ int key[2];
|
|
|
|
+ unsigned long *filter;
|
|
|
|
+ int gen = filter_gen_from_seq(seq);
|
|
|
|
+
|
|
|
|
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
|
|
|
|
+ if (!filter)
|
|
|
|
+ return;
|
|
|
|
+
|
|
|
|
+ get_item_key(item, key);
|
|
|
|
+
|
|
|
|
+ if (!test_bit(key[0], filter))
|
|
|
|
+ set_bit(key[0], filter);
|
|
|
|
+ if (!test_bit(key[1], filter))
|
|
|
|
+ set_bit(key[1], filter);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
|
|
|
|
+{
|
|
|
|
+ int key[2];
|
|
|
|
+ unsigned long *filter;
|
|
|
|
+ int gen = filter_gen_from_seq(seq);
|
|
|
|
+
|
|
|
|
+ filter = READ_ONCE(lruvec->mm_state.filters[gen]);
|
|
|
|
+ if (!filter)
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ get_item_key(item, key);
|
|
|
|
+
|
|
|
|
+ return test_bit(key[0], filter) && test_bit(key[1], filter);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
|
|
|
|
+{
|
|
|
|
+ int i;
|
|
|
|
+ int hist;
|
|
|
|
+
|
|
|
|
+ lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
|
|
|
|
+
|
|
|
|
+ if (walk) {
|
|
|
|
+ hist = lru_hist_from_seq(walk->max_seq);
|
|
|
|
+
|
|
|
|
+ for (i = 0; i < NR_MM_STATS; i++) {
|
|
|
|
+ WRITE_ONCE(lruvec->mm_state.stats[hist][i],
|
|
|
|
+ lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
|
|
|
|
+ walk->mm_stats[i] = 0;
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (NR_HIST_GENS > 1 && last) {
|
|
|
|
+ hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
|
|
|
|
+
|
|
|
|
+ for (i = 0; i < NR_MM_STATS; i++)
|
|
|
|
+ WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
|
|
|
|
+{
|
|
|
|
+ int type;
|
|
|
|
+ unsigned long size = 0;
|
|
|
|
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
|
|
|
|
+ int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
|
|
|
|
+
|
|
|
|
+ if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ clear_bit(key, &mm->lru_gen.bitmap);
|
|
|
|
+
|
|
|
|
+ for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
|
|
|
|
+ size += type ? get_mm_counter(mm, MM_FILEPAGES) :
|
|
|
|
+ get_mm_counter(mm, MM_ANONPAGES) +
|
|
|
|
+ get_mm_counter(mm, MM_SHMEMPAGES);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (size < MIN_LRU_BATCH)
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ return !mmget_not_zero(mm);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
|
|
|
|
+ struct mm_struct **iter)
|
|
|
|
+{
|
|
|
|
+ bool first = false;
|
|
|
|
+ bool last = true;
|
|
|
|
+ struct mm_struct *mm = NULL;
|
|
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
|
|
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
|
|
|
|
+ struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
|
|
|
|
+
|
|
|
|
+ /*
|
|
|
|
+ * There are four interesting cases for this page table walker:
|
|
|
|
+ * 1. It tries to start a new iteration of mm_list with a stale max_seq;
|
|
|
|
+ * there is nothing left to do.
|
|
|
|
+ * 2. It's the first of the current generation, and it needs to reset
|
|
|
|
+ * the Bloom filter for the next generation.
|
|
|
|
+ * 3. It reaches the end of mm_list, and it needs to increment
|
|
|
|
+ * mm_state->seq; the iteration is done.
|
|
|
|
+ * 4. It's the last of the current generation, and it needs to reset the
|
|
|
|
+ * mm stats counters for the next generation.
|
|
|
|
+ */
|
|
|
|
+ spin_lock(&mm_list->lock);
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
|
|
|
|
+ VM_WARN_ON_ONCE(*iter && mm_state->seq > walk->max_seq);
|
|
|
|
+ VM_WARN_ON_ONCE(*iter && !mm_state->nr_walkers);
|
|
|
|
+
|
|
|
|
+ if (walk->max_seq <= mm_state->seq) {
|
|
|
|
+ if (!*iter)
|
|
|
|
+ last = false;
|
|
|
|
+ goto done;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (!mm_state->nr_walkers) {
|
|
|
|
+ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
|
|
|
|
+
|
|
|
|
+ mm_state->head = mm_list->fifo.next;
|
|
|
|
+ first = true;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ while (!mm && mm_state->head != &mm_list->fifo) {
|
|
|
|
+ mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
|
|
|
|
+
|
|
|
|
+ mm_state->head = mm_state->head->next;
|
|
|
|
+
|
|
|
|
+ /* force scan for those added after the last iteration */
|
|
|
|
+ if (!mm_state->tail || mm_state->tail == &mm->lru_gen.list) {
|
|
|
|
+ mm_state->tail = mm_state->head;
|
|
|
|
+ walk->force_scan = true;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (should_skip_mm(mm, walk))
|
|
|
|
+ mm = NULL;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (mm_state->head == &mm_list->fifo)
|
|
|
|
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
|
|
|
|
+done:
|
|
|
|
+ if (*iter && !mm)
|
|
|
|
+ mm_state->nr_walkers--;
|
|
|
|
+ if (!*iter && mm)
|
|
|
|
+ mm_state->nr_walkers++;
|
|
|
|
+
|
|
|
|
+ if (mm_state->nr_walkers)
|
|
|
|
+ last = false;
|
|
|
|
+
|
|
|
|
+ if (*iter || last)
|
|
|
|
+ reset_mm_stats(lruvec, walk, last);
|
|
|
|
+
|
|
|
|
+ spin_unlock(&mm_list->lock);
|
|
|
|
+
|
|
|
|
+ if (mm && first)
|
|
|
|
+ reset_bloom_filter(lruvec, walk->max_seq + 1);
|
|
|
|
+
|
|
|
|
+ if (*iter)
|
|
|
|
+ mmput_async(*iter);
|
|
|
|
+
|
|
|
|
+ *iter = mm;
|
|
|
|
+
|
|
|
|
+ return last;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
|
|
|
|
+{
|
|
|
|
+ bool success = false;
|
|
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
|
|
+ struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
|
|
|
|
+ struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
|
|
|
|
+
|
|
|
|
+ spin_lock(&mm_list->lock);
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
|
|
|
|
+
|
|
|
|
+ if (max_seq > mm_state->seq && !mm_state->nr_walkers) {
|
|
|
|
+ VM_WARN_ON_ONCE(mm_state->head && mm_state->head != &mm_list->fifo);
|
|
|
|
+
|
|
|
|
+ WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
|
|
|
|
+ reset_mm_stats(lruvec, NULL, true);
|
|
|
|
+ success = true;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ spin_unlock(&mm_list->lock);
|
|
|
|
+
|
|
|
|
+ return success;
|
|
|
|
+}
|
|
|
|
+
|
2023-03-25 16:24:27 +00:00
|
|
|
+/******************************************************************************
|
2023-03-20 21:51:03 +00:00
|
|
|
* refault feedback loop
|
|
|
|
******************************************************************************/
|
2023-03-25 16:24:27 +00:00
|
|
|
|
|
|
|
@@ -3048,6 +3415,118 @@ static int page_inc_gen(struct lruvec *l
|
2023-03-20 21:51:03 +00:00
|
|
|
return new_gen;
|
|
|
|
}
|
|
|
|
|
|
|
|
+static void update_batch_size(struct lru_gen_mm_walk *walk, struct page *page,
|
|
|
|
+ int old_gen, int new_gen)
|
|
|
|
+{
|
|
|
|
+ int type = page_is_file_lru(page);
|
|
|
|
+ int zone = page_zonenum(page);
|
|
|
|
+ int delta = thp_nr_pages(page);
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
|
|
|
|
+ VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
|
|
|
|
+
|
|
|
|
+ walk->batched++;
|
|
|
|
+
|
|
|
|
+ walk->nr_pages[old_gen][type][zone] -= delta;
|
|
|
|
+ walk->nr_pages[new_gen][type][zone] += delta;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
|
|
|
|
+{
|
|
|
|
+ int gen, type, zone;
|
|
|
|
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
|
|
+
|
|
|
|
+ walk->batched = 0;
|
|
|
|
+
|
|
|
|
+ for_each_gen_type_zone(gen, type, zone) {
|
|
|
|
+ enum lru_list lru = type * LRU_INACTIVE_FILE;
|
|
|
|
+ int delta = walk->nr_pages[gen][type][zone];
|
|
|
|
+
|
|
|
|
+ if (!delta)
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ walk->nr_pages[gen][type][zone] = 0;
|
|
|
|
+ WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
|
|
|
|
+ lrugen->nr_pages[gen][type][zone] + delta);
|
|
|
|
+
|
|
|
|
+ if (lru_gen_is_active(lruvec, gen))
|
|
|
|
+ lru += LRU_ACTIVE;
|
|
|
|
+ __update_lru_size(lruvec, lru, zone, delta);
|
|
|
|
+ }
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
|
|
|
|
+{
|
|
|
|
+ struct address_space *mapping;
|
|
|
|
+ struct vm_area_struct *vma = args->vma;
|
|
|
|
+ struct lru_gen_mm_walk *walk = args->private;
|
|
|
|
+
|
|
|
|
+ if (!vma_is_accessible(vma))
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ if (is_vm_hugetlb_page(vma))
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL | VM_SEQ_READ | VM_RAND_READ))
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ if (vma == get_gate_vma(vma->vm_mm))
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ if (vma_is_anonymous(vma))
|
|
|
|
+ return !walk->can_swap;
|
|
|
|
+
|
|
|
|
+ if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ mapping = vma->vm_file->f_mapping;
|
|
|
|
+ if (mapping_unevictable(mapping))
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ if (shmem_mapping(mapping))
|
|
|
|
+ return !walk->can_swap;
|
|
|
|
+
|
|
|
|
+ /* to exclude special mappings like dax, etc. */
|
|
|
|
+ return !mapping->a_ops->readpage;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+/*
|
|
|
|
+ * Some userspace memory allocators map many single-page VMAs. Instead of
|
|
|
|
+ * returning back to the PGD table for each of such VMAs, finish an entire PMD
|
|
|
|
+ * table to reduce zigzags and improve cache performance.
|
|
|
|
+ */
|
|
|
|
+static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
|
|
|
|
+ unsigned long *vm_start, unsigned long *vm_end)
|
|
|
|
+{
|
|
|
|
+ unsigned long start = round_up(*vm_end, size);
|
|
|
|
+ unsigned long end = (start | ~mask) + 1;
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(mask & size);
|
|
|
|
+ VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
|
|
|
|
+
|
|
|
|
+ while (args->vma) {
|
|
|
|
+ if (start >= args->vma->vm_end) {
|
|
|
|
+ args->vma = args->vma->vm_next;
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (end && end <= args->vma->vm_start)
|
|
|
|
+ return false;
|
|
|
|
+
|
|
|
|
+ if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) {
|
|
|
|
+ args->vma = args->vma->vm_next;
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ *vm_start = max(start, args->vma->vm_start);
|
|
|
|
+ *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
|
|
|
|
+
|
|
|
|
+ return true;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ return false;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
|
|
|
|
{
|
|
|
|
unsigned long pfn = pte_pfn(pte);
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3066,8 +3545,28 @@ static unsigned long get_pte_pfn(pte_t p
|
2023-03-20 21:51:03 +00:00
|
|
|
return pfn;
|
|
|
|
}
|
|
|
|
|
|
|
|
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
|
|
|
|
+static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
|
|
|
|
+{
|
|
|
|
+ unsigned long pfn = pmd_pfn(pmd);
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
|
|
|
|
+
|
|
|
|
+ if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
|
|
|
|
+ return -1;
|
|
|
|
+
|
|
|
|
+ if (WARN_ON_ONCE(pmd_devmap(pmd)))
|
|
|
|
+ return -1;
|
|
|
|
+
|
|
|
|
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
|
|
|
|
+ return -1;
|
|
|
|
+
|
|
|
|
+ return pfn;
|
|
|
|
+}
|
|
|
|
+#endif
|
|
|
|
+
|
|
|
|
static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
|
|
|
|
- struct pglist_data *pgdat)
|
|
|
|
+ struct pglist_data *pgdat, bool can_swap)
|
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3082,9 +3581,375 @@ static struct page *get_pfn_page(unsigne
|
2023-03-20 21:51:03 +00:00
|
|
|
if (page_memcg_rcu(page) != memcg)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
+ /* file VMAs can contain anon pages from COW */
|
|
|
|
+ if (!page_is_file_lru(page) && !can_swap)
|
|
|
|
+ return NULL;
|
|
|
|
+
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
+static bool suitable_to_scan(int total, int young)
|
|
|
|
+{
|
|
|
|
+ int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
|
|
|
|
+
|
|
|
|
+ /* suitable if the average number of young PTEs per cacheline is >=1 */
|
|
|
|
+ return young * n >= total;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
|
|
|
|
+ struct mm_walk *args)
|
|
|
|
+{
|
|
|
|
+ int i;
|
|
|
|
+ pte_t *pte;
|
|
|
|
+ spinlock_t *ptl;
|
|
|
|
+ unsigned long addr;
|
|
|
|
+ int total = 0;
|
|
|
|
+ int young = 0;
|
|
|
|
+ struct lru_gen_mm_walk *walk = args->private;
|
|
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
|
|
|
|
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
|
|
|
|
+ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(pmd_leaf(*pmd));
|
|
|
|
+
|
|
|
|
+ ptl = pte_lockptr(args->mm, pmd);
|
|
|
|
+ if (!spin_trylock(ptl))
|
|
|
|
+ return false;
|
|
|
|
+
|
|
|
|
+ arch_enter_lazy_mmu_mode();
|
|
|
|
+
|
|
|
|
+ pte = pte_offset_map(pmd, start & PMD_MASK);
|
|
|
|
+restart:
|
|
|
|
+ for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
|
|
|
|
+ unsigned long pfn;
|
|
|
|
+ struct page *page;
|
|
|
|
+
|
|
|
|
+ total++;
|
|
|
|
+ walk->mm_stats[MM_LEAF_TOTAL]++;
|
|
|
|
+
|
|
|
|
+ pfn = get_pte_pfn(pte[i], args->vma, addr);
|
|
|
|
+ if (pfn == -1)
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ if (!pte_young(pte[i])) {
|
|
|
|
+ walk->mm_stats[MM_LEAF_OLD]++;
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
|
|
|
|
+ if (!page)
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
|
|
|
|
+ VM_WARN_ON_ONCE(true);
|
|
|
|
+
|
|
|
|
+ young++;
|
|
|
|
+ walk->mm_stats[MM_LEAF_YOUNG]++;
|
|
|
|
+
|
|
|
|
+ if (pte_dirty(pte[i]) && !PageDirty(page) &&
|
|
|
|
+ !(PageAnon(page) && PageSwapBacked(page) &&
|
|
|
|
+ !PageSwapCache(page)))
|
|
|
|
+ set_page_dirty(page);
|
|
|
|
+
|
|
|
|
+ old_gen = page_update_gen(page, new_gen);
|
|
|
|
+ if (old_gen >= 0 && old_gen != new_gen)
|
|
|
|
+ update_batch_size(walk, page, old_gen, new_gen);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
|
|
|
|
+ goto restart;
|
|
|
|
+
|
|
|
|
+ pte_unmap(pte);
|
|
|
|
+
|
|
|
|
+ arch_leave_lazy_mmu_mode();
|
|
|
|
+ spin_unlock(ptl);
|
|
|
|
+
|
|
|
|
+ return suitable_to_scan(total, young);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
|
|
|
|
+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
|
|
|
|
+ struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
|
|
|
|
+{
|
|
|
|
+ int i;
|
|
|
|
+ pmd_t *pmd;
|
|
|
|
+ spinlock_t *ptl;
|
|
|
|
+ struct lru_gen_mm_walk *walk = args->private;
|
|
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
|
|
|
|
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
|
|
|
|
+ int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(pud_leaf(*pud));
|
|
|
|
+
|
|
|
|
+ /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
|
|
|
|
+ if (*start == -1) {
|
|
|
|
+ *start = next;
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ i = next == -1 ? 0 : pmd_index(next) - pmd_index(*start);
|
|
|
|
+ if (i && i <= MIN_LRU_BATCH) {
|
|
|
|
+ __set_bit(i - 1, bitmap);
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ pmd = pmd_offset(pud, *start);
|
|
|
|
+
|
|
|
|
+ ptl = pmd_lockptr(args->mm, pmd);
|
|
|
|
+ if (!spin_trylock(ptl))
|
|
|
|
+ goto done;
|
|
|
|
+
|
|
|
|
+ arch_enter_lazy_mmu_mode();
|
|
|
|
+
|
|
|
|
+ do {
|
|
|
|
+ unsigned long pfn;
|
|
|
|
+ struct page *page;
|
|
|
|
+ unsigned long addr = i ? (*start & PMD_MASK) + i * PMD_SIZE : *start;
|
|
|
|
+
|
|
|
|
+ pfn = get_pmd_pfn(pmd[i], vma, addr);
|
|
|
|
+ if (pfn == -1)
|
|
|
|
+ goto next;
|
|
|
|
+
|
|
|
|
+ if (!pmd_trans_huge(pmd[i])) {
|
|
|
|
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
|
|
|
|
+ pmdp_test_and_clear_young(vma, addr, pmd + i);
|
|
|
|
+ goto next;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ page = get_pfn_page(pfn, memcg, pgdat, walk->can_swap);
|
|
|
|
+ if (!page)
|
|
|
|
+ goto next;
|
|
|
|
+
|
|
|
|
+ if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
|
|
|
|
+ goto next;
|
|
|
|
+
|
|
|
|
+ walk->mm_stats[MM_LEAF_YOUNG]++;
|
|
|
|
+
|
|
|
|
+ if (pmd_dirty(pmd[i]) && !PageDirty(page) &&
|
|
|
|
+ !(PageAnon(page) && PageSwapBacked(page) &&
|
|
|
|
+ !PageSwapCache(page)))
|
|
|
|
+ set_page_dirty(page);
|
|
|
|
+
|
|
|
|
+ old_gen = page_update_gen(page, new_gen);
|
|
|
|
+ if (old_gen >= 0 && old_gen != new_gen)
|
|
|
|
+ update_batch_size(walk, page, old_gen, new_gen);
|
|
|
|
+next:
|
|
|
|
+ i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
|
|
|
|
+ } while (i <= MIN_LRU_BATCH);
|
|
|
|
+
|
|
|
|
+ arch_leave_lazy_mmu_mode();
|
|
|
|
+ spin_unlock(ptl);
|
|
|
|
+done:
|
|
|
|
+ *start = -1;
|
|
|
|
+ bitmap_zero(bitmap, MIN_LRU_BATCH);
|
|
|
|
+}
|
|
|
|
+#else
|
|
|
|
+static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area_struct *vma,
|
|
|
|
+ struct mm_walk *args, unsigned long *bitmap, unsigned long *start)
|
|
|
|
+{
|
|
|
|
+}
|
|
|
|
+#endif
|
|
|
|
+
|
|
|
|
+static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
|
|
|
|
+ struct mm_walk *args)
|
|
|
|
+{
|
|
|
|
+ int i;
|
|
|
|
+ pmd_t *pmd;
|
|
|
|
+ unsigned long next;
|
|
|
|
+ unsigned long addr;
|
|
|
|
+ struct vm_area_struct *vma;
|
|
|
|
+ unsigned long pos = -1;
|
|
|
|
+ struct lru_gen_mm_walk *walk = args->private;
|
|
|
|
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(pud_leaf(*pud));
|
|
|
|
+
|
|
|
|
+ /*
|
|
|
|
+ * Finish an entire PMD in two passes: the first only reaches to PTE
|
|
|
|
+ * tables to avoid taking the PMD lock; the second, if necessary, takes
|
|
|
|
+ * the PMD lock to clear the accessed bit in PMD entries.
|
|
|
|
+ */
|
|
|
|
+ pmd = pmd_offset(pud, start & PUD_MASK);
|
|
|
|
+restart:
|
|
|
|
+ /* walk_pte_range() may call get_next_vma() */
|
|
|
|
+ vma = args->vma;
|
|
|
|
+ for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
|
|
|
|
+ pmd_t val = pmd_read_atomic(pmd + i);
|
|
|
|
+
|
|
|
|
+ /* for pmd_read_atomic() */
|
|
|
|
+ barrier();
|
|
|
|
+
|
|
|
|
+ next = pmd_addr_end(addr, end);
|
|
|
|
+
|
|
|
|
+ if (!pmd_present(val) || is_huge_zero_pmd(val)) {
|
|
|
|
+ walk->mm_stats[MM_LEAF_TOTAL]++;
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
|
|
|
+ if (pmd_trans_huge(val)) {
|
|
|
|
+ unsigned long pfn = pmd_pfn(val);
|
|
|
|
+ struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
|
|
|
|
+
|
|
|
|
+ walk->mm_stats[MM_LEAF_TOTAL]++;
|
|
|
|
+
|
|
|
|
+ if (!pmd_young(val)) {
|
|
|
|
+ walk->mm_stats[MM_LEAF_OLD]++;
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /* try to avoid unnecessary memory loads */
|
|
|
|
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
|
|
|
+ continue;
|
|
|
|
+ }
|
|
|
|
+#endif
|
|
|
|
+ walk->mm_stats[MM_NONLEAF_TOTAL]++;
|
|
|
|
+
|
|
|
|
+#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
|
|
|
|
+ if (!pmd_young(val))
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
|
|
|
|
+#endif
|
|
|
|
+ if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ walk->mm_stats[MM_NONLEAF_FOUND]++;
|
|
|
|
+
|
|
|
|
+ if (!walk_pte_range(&val, addr, next, args))
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ walk->mm_stats[MM_NONLEAF_ADDED]++;
|
|
|
|
+
|
|
|
|
+ /* carry over to the next generation */
|
|
|
|
+ update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ walk_pmd_range_locked(pud, -1, vma, args, bitmap, &pos);
|
|
|
|
+
|
|
|
|
+ if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
|
|
|
|
+ goto restart;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
|
|
|
|
+ struct mm_walk *args)
|
|
|
|
+{
|
|
|
|
+ int i;
|
|
|
|
+ pud_t *pud;
|
|
|
|
+ unsigned long addr;
|
|
|
|
+ unsigned long next;
|
|
|
|
+ struct lru_gen_mm_walk *walk = args->private;
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(p4d_leaf(*p4d));
|
|
|
|
+
|
|
|
|
+ pud = pud_offset(p4d, start & P4D_MASK);
|
|
|
|
+restart:
|
|
|
|
+ for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
|
|
|
|
+ pud_t val = READ_ONCE(pud[i]);
|
|
|
|
+
|
|
|
|
+ next = pud_addr_end(addr, end);
|
|
|
|
+
|
|
|
|
+ if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
|
|
|
|
+ continue;
|
|
|
|
+
|
|
|
|
+ walk_pmd_range(&val, addr, next, args);
|
|
|
|
+
|
|
|
|
+ /* a racy check to curtail the waiting time */
|
|
|
|
+ if (wq_has_sleeper(&walk->lruvec->mm_state.wait))
|
|
|
|
+ return 1;
|
|
|
|
+
|
|
|
|
+ if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
|
|
|
|
+ end = (addr | ~PUD_MASK) + 1;
|
|
|
|
+ goto done;
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
|
|
|
|
+ goto restart;
|
|
|
|
+
|
|
|
|
+ end = round_up(end, P4D_SIZE);
|
|
|
|
+done:
|
|
|
|
+ if (!end || !args->vma)
|
|
|
|
+ return 1;
|
|
|
|
+
|
|
|
|
+ walk->next_addr = max(end, args->vma->vm_start);
|
|
|
|
+
|
|
|
|
+ return -EAGAIN;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
|
|
|
|
+{
|
|
|
|
+ static const struct mm_walk_ops mm_walk_ops = {
|
|
|
|
+ .test_walk = should_skip_vma,
|
|
|
|
+ .p4d_entry = walk_pud_range,
|
|
|
|
+ };
|
|
|
|
+
|
|
|
|
+ int err;
|
|
|
|
+ struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
|
|
+
|
|
|
|
+ walk->next_addr = FIRST_USER_ADDRESS;
|
|
|
|
+
|
|
|
|
+ do {
|
|
|
|
+ err = -EBUSY;
|
|
|
|
+
|
|
|
|
+ /* page_update_gen() requires stable page_memcg() */
|
|
|
|
+ if (!mem_cgroup_trylock_pages(memcg))
|
|
|
|
+ break;
|
|
|
|
+
|
|
|
|
+ /* the caller might be holding the lock for write */
|
|
|
|
+ if (mmap_read_trylock(mm)) {
|
|
|
|
+ err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
|
|
|
|
+
|
|
|
|
+ mmap_read_unlock(mm);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ mem_cgroup_unlock_pages();
|
|
|
|
+
|
|
|
|
+ if (walk->batched) {
|
|
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
|
|
+ reset_batch_size(lruvec, walk);
|
|
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ cond_resched();
|
|
|
|
+ } while (err == -EAGAIN);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat)
|
|
|
|
+{
|
|
|
|
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
|
|
|
|
+
|
|
|
|
+ if (pgdat && current_is_kswapd()) {
|
|
|
|
+ VM_WARN_ON_ONCE(walk);
|
|
|
|
+
|
|
|
|
+ walk = &pgdat->mm_walk;
|
|
|
|
+ } else if (!pgdat && !walk) {
|
|
|
|
+ VM_WARN_ON_ONCE(current_is_kswapd());
|
|
|
|
+
|
|
|
|
+ walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ current->reclaim_state->mm_walk = walk;
|
|
|
|
+
|
|
|
|
+ return walk;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
+static void clear_mm_walk(void)
|
|
|
|
+{
|
|
|
|
+ struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
|
|
|
|
+ VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
|
|
|
|
+
|
|
|
|
+ current->reclaim_state->mm_walk = NULL;
|
|
|
|
+
|
|
|
|
+ if (!current_is_kswapd())
|
|
|
|
+ kfree(walk);
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
static void inc_min_seq(struct lruvec *lruvec, int type)
|
|
|
|
{
|
|
|
|
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3136,7 +4001,7 @@ next:
|
2023-03-20 21:51:03 +00:00
|
|
|
return success;
|
|
|
|
}
|
|
|
|
|
|
|
|
-static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap)
|
|
|
|
+static void inc_max_seq(struct lruvec *lruvec, bool can_swap)
|
|
|
|
{
|
|
|
|
int prev, next;
|
|
|
|
int type, zone;
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3146,9 +4011,6 @@ static void inc_max_seq(struct lruvec *l
|
2023-03-20 21:51:03 +00:00
|
|
|
|
|
|
|
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
|
|
|
|
|
|
|
|
- if (max_seq != lrugen->max_seq)
|
|
|
|
- goto unlock;
|
|
|
|
-
|
|
|
|
for (type = ANON_AND_FILE - 1; type >= 0; type--) {
|
|
|
|
if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
|
|
|
|
continue;
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3186,10 +4048,76 @@ static void inc_max_seq(struct lruvec *l
|
2023-03-20 21:51:03 +00:00
|
|
|
|
|
|
|
/* make sure preceding modifications appear */
|
|
|
|
smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
|
|
|
|
-unlock:
|
|
|
|
+
|
|
|
|
spin_unlock_irq(&lruvec->lru_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
+static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
|
|
|
|
+ struct scan_control *sc, bool can_swap)
|
|
|
|
+{
|
|
|
|
+ bool success;
|
|
|
|
+ struct lru_gen_mm_walk *walk;
|
|
|
|
+ struct mm_struct *mm = NULL;
|
|
|
|
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
|
|
|
|
+
|
|
|
|
+ /* see the comment in iterate_mm_list() */
|
|
|
|
+ if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
|
|
|
|
+ success = false;
|
|
|
|
+ goto done;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /*
|
|
|
|
+ * If the hardware doesn't automatically set the accessed bit, fallback
|
|
|
|
+ * to lru_gen_look_around(), which only clears the accessed bit in a
|
|
|
|
+ * handful of PTEs. Spreading the work out over a period of time usually
|
|
|
|
+ * is less efficient, but it avoids bursty page faults.
|
|
|
|
+ */
|
|
|
|
+ if (!arch_has_hw_pte_young()) {
|
|
|
|
+ success = iterate_mm_list_nowalk(lruvec, max_seq);
|
|
|
|
+ goto done;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ walk = set_mm_walk(NULL);
|
|
|
|
+ if (!walk) {
|
|
|
|
+ success = iterate_mm_list_nowalk(lruvec, max_seq);
|
|
|
|
+ goto done;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ walk->lruvec = lruvec;
|
|
|
|
+ walk->max_seq = max_seq;
|
|
|
|
+ walk->can_swap = can_swap;
|
|
|
|
+ walk->force_scan = false;
|
|
|
|
+
|
|
|
|
+ do {
|
|
|
|
+ success = iterate_mm_list(lruvec, walk, &mm);
|
|
|
|
+ if (mm)
|
|
|
|
+ walk_mm(lruvec, mm, walk);
|
|
|
|
+
|
|
|
|
+ cond_resched();
|
|
|
|
+ } while (mm);
|
|
|
|
+done:
|
|
|
|
+ if (!success) {
|
|
|
|
+ if (sc->priority <= DEF_PRIORITY - 2)
|
|
|
|
+ wait_event_killable(lruvec->mm_state.wait,
|
|
|
|
+ max_seq < READ_ONCE(lrugen->max_seq));
|
|
|
|
+
|
|
|
|
+ return max_seq < READ_ONCE(lrugen->max_seq);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ VM_WARN_ON_ONCE(max_seq != READ_ONCE(lrugen->max_seq));
|
|
|
|
+
|
|
|
|
+ inc_max_seq(lruvec, can_swap);
|
|
|
|
+ /* either this sees any waiters or they will see updated max_seq */
|
|
|
|
+ if (wq_has_sleeper(&lruvec->mm_state.wait))
|
|
|
|
+ wake_up_all(&lruvec->mm_state.wait);
|
|
|
|
+
|
|
|
|
+ wakeup_flusher_threads(WB_REASON_VMSCAN);
|
|
|
|
+
|
|
|
|
+ return true;
|
|
|
|
+}
|
|
|
|
+
|
|
|
|
static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq,
|
|
|
|
struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
|
|
|
|
{
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3265,7 +4193,7 @@ static void age_lruvec(struct lruvec *lr
|
2023-03-20 21:51:03 +00:00
|
|
|
|
|
|
|
need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
|
|
|
|
if (need_aging)
|
|
|
|
- inc_max_seq(lruvec, max_seq, swappiness);
|
|
|
|
+ try_to_inc_max_seq(lruvec, max_seq, sc, swappiness);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3274,6 +4202,8 @@ static void lru_gen_age_node(struct pgli
|
2023-03-20 21:51:03 +00:00
|
|
|
|
|
|
|
VM_WARN_ON_ONCE(!current_is_kswapd());
|
|
|
|
|
|
|
|
+ set_mm_walk(pgdat);
|
|
|
|
+
|
|
|
|
memcg = mem_cgroup_iter(NULL, NULL, NULL);
|
|
|
|
do {
|
|
|
|
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3282,11 +4212,16 @@ static void lru_gen_age_node(struct pgli
|
2023-03-20 21:51:03 +00:00
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
|
|
|
+
|
|
|
|
+ clear_mm_walk();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function exploits spatial locality when shrink_page_list() walks the
|
|
|
|
- * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
|
|
|
|
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
|
|
|
|
+ * the scan was done cacheline efficiently, it adds the PMD entry pointing to
|
|
|
|
+ * the PTE table to the Bloom filter. This forms a feedback loop between the
|
|
|
|
+ * eviction and the aging.
|
|
|
|
*/
|
|
|
|
void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
|
|
|
|
{
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3295,6 +4230,8 @@ void lru_gen_look_around(struct page_vma
|
2023-03-20 21:51:03 +00:00
|
|
|
unsigned long start;
|
|
|
|
unsigned long end;
|
|
|
|
unsigned long addr;
|
|
|
|
+ struct lru_gen_mm_walk *walk;
|
|
|
|
+ int young = 0;
|
|
|
|
unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
|
|
|
|
struct page *page = pvmw->page;
|
|
|
|
struct mem_cgroup *memcg = page_memcg(page);
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3309,6 +4246,9 @@ void lru_gen_look_around(struct page_vma
|
2023-03-20 21:51:03 +00:00
|
|
|
if (spin_is_contended(pvmw->ptl))
|
|
|
|
return;
|
|
|
|
|
|
|
|
+ /* avoid taking the LRU lock under the PTL when possible */
|
|
|
|
+ walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
|
|
|
|
+
|
|
|
|
start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
|
|
|
|
end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
|
|
|
|
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3338,13 +4278,15 @@ void lru_gen_look_around(struct page_vma
|
2023-03-20 21:51:03 +00:00
|
|
|
if (!pte_young(pte[i]))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
- page = get_pfn_page(pfn, memcg, pgdat);
|
|
|
|
+ page = get_pfn_page(pfn, memcg, pgdat, !walk || walk->can_swap);
|
|
|
|
if (!page)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
|
|
|
|
VM_WARN_ON_ONCE(true);
|
|
|
|
|
|
|
|
+ young++;
|
|
|
|
+
|
|
|
|
if (pte_dirty(pte[i]) && !PageDirty(page) &&
|
|
|
|
!(PageAnon(page) && PageSwapBacked(page) &&
|
|
|
|
!PageSwapCache(page)))
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3360,7 +4302,11 @@ void lru_gen_look_around(struct page_vma
|
2023-03-20 21:51:03 +00:00
|
|
|
arch_leave_lazy_mmu_mode();
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
- if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
|
|
|
|
+ /* feedback from rmap walkers to page table walkers */
|
|
|
|
+ if (suitable_to_scan(i, young))
|
|
|
|
+ update_bloom_filter(lruvec, max_seq, pvmw->pmd);
|
|
|
|
+
|
|
|
|
+ if (!walk && bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
|
|
|
|
for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
|
|
|
|
page = pte_page(pte[i]);
|
|
|
|
activate_page(page);
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3372,8 +4318,10 @@ void lru_gen_look_around(struct page_vma
|
2023-03-20 21:51:03 +00:00
|
|
|
if (!mem_cgroup_trylock_pages(memcg))
|
|
|
|
return;
|
|
|
|
|
|
|
|
- spin_lock_irq(&lruvec->lru_lock);
|
|
|
|
- new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
|
|
|
|
+ if (!walk) {
|
|
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
|
|
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
|
|
|
|
+ }
|
|
|
|
|
|
|
|
for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
|
|
|
|
page = compound_head(pte_page(pte[i]));
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3384,10 +4332,14 @@ void lru_gen_look_around(struct page_vma
|
2023-03-20 21:51:03 +00:00
|
|
|
if (old_gen < 0 || old_gen == new_gen)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
- lru_gen_update_size(lruvec, page, old_gen, new_gen);
|
|
|
|
+ if (walk)
|
|
|
|
+ update_batch_size(walk, page, old_gen, new_gen);
|
|
|
|
+ else
|
|
|
|
+ lru_gen_update_size(lruvec, page, old_gen, new_gen);
|
|
|
|
}
|
|
|
|
|
|
|
|
- spin_unlock_irq(&lruvec->lru_lock);
|
|
|
|
+ if (!walk)
|
|
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
|
|
|
|
|
|
mem_cgroup_unlock_pages();
|
|
|
|
}
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3670,6 +4622,7 @@ static int evict_pages(struct lruvec *lr
|
2023-03-20 21:51:03 +00:00
|
|
|
struct page *page;
|
|
|
|
enum vm_event_item item;
|
|
|
|
struct reclaim_stat stat;
|
|
|
|
+ struct lru_gen_mm_walk *walk;
|
|
|
|
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
|
|
|
|
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
|
|
|
|
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3706,6 +4659,10 @@ static int evict_pages(struct lruvec *lr
|
2023-03-20 21:51:03 +00:00
|
|
|
|
|
|
|
move_pages_to_lru(lruvec, &list);
|
|
|
|
|
|
|
|
+ walk = current->reclaim_state->mm_walk;
|
|
|
|
+ if (walk && walk->batched)
|
|
|
|
+ reset_batch_size(lruvec, walk);
|
|
|
|
+
|
|
|
|
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
|
|
|
|
if (!cgroup_reclaim(sc))
|
|
|
|
__count_vm_events(item, reclaimed);
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3722,6 +4679,11 @@ static int evict_pages(struct lruvec *lr
|
2023-03-20 21:51:03 +00:00
|
|
|
return scanned;
|
|
|
|
}
|
|
|
|
|
|
|
|
+/*
|
|
|
|
+ * For future optimizations:
|
|
|
|
+ * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
|
|
|
|
+ * reclaim.
|
|
|
|
+ */
|
|
|
|
static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc,
|
|
|
|
bool can_swap)
|
|
|
|
{
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3747,7 +4709,8 @@ static unsigned long get_nr_to_scan(stru
|
2023-03-20 21:51:03 +00:00
|
|
|
if (current_is_kswapd())
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
- inc_max_seq(lruvec, max_seq, can_swap);
|
|
|
|
+ if (try_to_inc_max_seq(lruvec, max_seq, sc, can_swap))
|
|
|
|
+ return nr_to_scan;
|
|
|
|
done:
|
|
|
|
return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0;
|
|
|
|
}
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3761,6 +4724,8 @@ static void lru_gen_shrink_lruvec(struct
|
2023-03-20 21:51:03 +00:00
|
|
|
|
|
|
|
blk_start_plug(&plug);
|
|
|
|
|
|
|
|
+ set_mm_walk(lruvec_pgdat(lruvec));
|
|
|
|
+
|
|
|
|
while (true) {
|
|
|
|
int delta;
|
|
|
|
int swappiness;
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3788,6 +4753,8 @@ static void lru_gen_shrink_lruvec(struct
|
2023-03-20 21:51:03 +00:00
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
+ clear_mm_walk();
|
|
|
|
+
|
|
|
|
blk_finish_plug(&plug);
|
|
|
|
}
|
|
|
|
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3804,15 +4771,21 @@ void lru_gen_init_lruvec(struct lruvec *
|
2023-03-20 21:51:03 +00:00
|
|
|
|
|
|
|
for_each_gen_type_zone(gen, type, zone)
|
|
|
|
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
|
|
|
|
+
|
|
|
|
+ lruvec->mm_state.seq = MIN_NR_GENS;
|
|
|
|
+ init_waitqueue_head(&lruvec->mm_state.wait);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_MEMCG
|
|
|
|
void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
+ INIT_LIST_HEAD(&memcg->mm_list.fifo);
|
|
|
|
+ spin_lock_init(&memcg->mm_list.lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
void lru_gen_exit_memcg(struct mem_cgroup *memcg)
|
|
|
|
{
|
|
|
|
+ int i;
|
|
|
|
int nid;
|
|
|
|
|
|
|
|
for_each_node(nid) {
|
2023-03-25 16:24:27 +00:00
|
|
|
@@ -3820,6 +4793,11 @@ void lru_gen_exit_memcg(struct mem_cgrou
|
2023-03-20 21:51:03 +00:00
|
|
|
|
|
|
|
VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
|
|
|
|
sizeof(lruvec->lrugen.nr_pages)));
|
|
|
|
+
|
|
|
|
+ for (i = 0; i < NR_BLOOM_FILTERS; i++) {
|
|
|
|
+ bitmap_free(lruvec->mm_state.filters[i]);
|
|
|
|
+ lruvec->mm_state.filters[i] = NULL;
|
|
|
|
+ }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#endif
|