mirror of
https://github.com/openwrt/openwrt.git
synced 2025-01-25 05:47:00 +00:00
708a507af0
Refresh kernel patches for generic kernel 5.15 due to new backport version of MGLRU patchset. Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
492 lines
14 KiB
Diff
492 lines
14 KiB
Diff
From e4277535f6d6708bb19b88c4bad155832671d69b Mon Sep 17 00:00:00 2001
|
|
From: Yu Zhao <yuzhao@google.com>
|
|
Date: Sun, 18 Sep 2022 02:00:04 -0600
|
|
Subject: [PATCH 07/29] mm: multi-gen LRU: exploit locality in rmap
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Searching the rmap for PTEs mapping each page on an LRU list (to test and
|
|
clear the accessed bit) can be expensive because pages from different VMAs
|
|
(PA space) are not cache friendly to the rmap (VA space). For workloads
|
|
mostly using mapped pages, searching the rmap can incur the highest CPU
|
|
cost in the reclaim path.
|
|
|
|
This patch exploits spatial locality to reduce the trips into the rmap.
|
|
When shrink_page_list() walks the rmap and finds a young PTE, a new
|
|
function lru_gen_look_around() scans at most BITS_PER_LONG-1 adjacent
|
|
PTEs. On finding another young PTE, it clears the accessed bit and
|
|
updates the gen counter of the page mapped by this PTE to
|
|
(max_seq%MAX_NR_GENS)+1.
|
|
|
|
Server benchmark results:
|
|
Single workload:
|
|
fio (buffered I/O): no change
|
|
|
|
Single workload:
|
|
memcached (anon): +[3, 5]%
|
|
Ops/sec KB/sec
|
|
patch1-6: 1106168.46 43025.04
|
|
patch1-7: 1147696.57 44640.29
|
|
|
|
Configurations:
|
|
no change
|
|
|
|
Client benchmark results:
|
|
kswapd profiles:
|
|
patch1-6
|
|
39.03% lzo1x_1_do_compress (real work)
|
|
18.47% page_vma_mapped_walk (overhead)
|
|
6.74% _raw_spin_unlock_irq
|
|
3.97% do_raw_spin_lock
|
|
2.49% ptep_clear_flush
|
|
2.48% anon_vma_interval_tree_iter_first
|
|
1.92% page_referenced_one
|
|
1.88% __zram_bvec_write
|
|
1.48% memmove
|
|
1.31% vma_interval_tree_iter_next
|
|
|
|
patch1-7
|
|
48.16% lzo1x_1_do_compress (real work)
|
|
8.20% page_vma_mapped_walk (overhead)
|
|
7.06% _raw_spin_unlock_irq
|
|
2.92% ptep_clear_flush
|
|
2.53% __zram_bvec_write
|
|
2.11% do_raw_spin_lock
|
|
2.02% memmove
|
|
1.93% lru_gen_look_around
|
|
1.56% free_unref_page_list
|
|
1.40% memset
|
|
|
|
Configurations:
|
|
no change
|
|
|
|
Link: https://lkml.kernel.org/r/20220918080010.2920238-8-yuzhao@google.com
|
|
Signed-off-by: Yu Zhao <yuzhao@google.com>
|
|
Acked-by: Barry Song <baohua@kernel.org>
|
|
Acked-by: Brian Geffon <bgeffon@google.com>
|
|
Acked-by: Jan Alexander Steffens (heftig) <heftig@archlinux.org>
|
|
Acked-by: Oleksandr Natalenko <oleksandr@natalenko.name>
|
|
Acked-by: Steven Barrett <steven@liquorix.net>
|
|
Acked-by: Suleiman Souhlal <suleiman@google.com>
|
|
Tested-by: Daniel Byrne <djbyrne@mtu.edu>
|
|
Tested-by: Donald Carr <d@chaos-reins.com>
|
|
Tested-by: Holger Hoffstätte <holger@applied-asynchrony.com>
|
|
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
|
|
Tested-by: Shuang Zhai <szhai2@cs.rochester.edu>
|
|
Tested-by: Sofia Trinh <sofia.trinh@edi.works>
|
|
Tested-by: Vaibhav Jain <vaibhav@linux.ibm.com>
|
|
Cc: Andi Kleen <ak@linux.intel.com>
|
|
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
|
|
Cc: Catalin Marinas <catalin.marinas@arm.com>
|
|
Cc: Dave Hansen <dave.hansen@linux.intel.com>
|
|
Cc: Hillf Danton <hdanton@sina.com>
|
|
Cc: Jens Axboe <axboe@kernel.dk>
|
|
Cc: Johannes Weiner <hannes@cmpxchg.org>
|
|
Cc: Jonathan Corbet <corbet@lwn.net>
|
|
Cc: Linus Torvalds <torvalds@linux-foundation.org>
|
|
Cc: Matthew Wilcox <willy@infradead.org>
|
|
Cc: Mel Gorman <mgorman@suse.de>
|
|
Cc: Miaohe Lin <linmiaohe@huawei.com>
|
|
Cc: Michael Larabel <Michael@MichaelLarabel.com>
|
|
Cc: Michal Hocko <mhocko@kernel.org>
|
|
Cc: Mike Rapoport <rppt@kernel.org>
|
|
Cc: Mike Rapoport <rppt@linux.ibm.com>
|
|
Cc: Peter Zijlstra <peterz@infradead.org>
|
|
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
|
|
Cc: Tejun Heo <tj@kernel.org>
|
|
Cc: Vlastimil Babka <vbabka@suse.cz>
|
|
Cc: Will Deacon <will@kernel.org>
|
|
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
|
---
|
|
include/linux/memcontrol.h | 31 +++++++
|
|
include/linux/mmzone.h | 6 ++
|
|
mm/internal.h | 1 +
|
|
mm/memcontrol.c | 1 +
|
|
mm/rmap.c | 7 ++
|
|
mm/swap.c | 4 +-
|
|
mm/vmscan.c | 184 +++++++++++++++++++++++++++++++++++++
|
|
7 files changed, 232 insertions(+), 2 deletions(-)
|
|
|
|
--- a/include/linux/memcontrol.h
|
|
+++ b/include/linux/memcontrol.h
|
|
@@ -442,6 +442,7 @@ static inline struct obj_cgroup *__page_
|
|
* - LRU isolation
|
|
* - lock_page_memcg()
|
|
* - exclusive reference
|
|
+ * - mem_cgroup_trylock_pages()
|
|
*
|
|
* For a kmem page a caller should hold an rcu read lock to protect memcg
|
|
* associated with a kmem page from being released.
|
|
@@ -497,6 +498,7 @@ static inline struct mem_cgroup *page_me
|
|
* - LRU isolation
|
|
* - lock_page_memcg()
|
|
* - exclusive reference
|
|
+ * - mem_cgroup_trylock_pages()
|
|
*
|
|
* For a kmem page a caller should hold an rcu read lock to protect memcg
|
|
* associated with a kmem page from being released.
|
|
@@ -953,6 +955,23 @@ void unlock_page_memcg(struct page *page
|
|
|
|
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val);
|
|
|
|
+/* try to stabilize page_memcg() for all the pages in a memcg */
|
|
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
|
|
+{
|
|
+ rcu_read_lock();
|
|
+
|
|
+ if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account))
|
|
+ return true;
|
|
+
|
|
+ rcu_read_unlock();
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline void mem_cgroup_unlock_pages(void)
|
|
+{
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
/* idx can be of type enum memcg_stat_item or node_stat_item */
|
|
static inline void mod_memcg_state(struct mem_cgroup *memcg,
|
|
int idx, int val)
|
|
@@ -1369,6 +1388,18 @@ static inline void unlock_page_memcg(str
|
|
{
|
|
}
|
|
|
|
+static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg)
|
|
+{
|
|
+ /* to match page_memcg_rcu() */
|
|
+ rcu_read_lock();
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static inline void mem_cgroup_unlock_pages(void)
|
|
+{
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
static inline void mem_cgroup_handle_over_high(void)
|
|
{
|
|
}
|
|
--- a/include/linux/mmzone.h
|
|
+++ b/include/linux/mmzone.h
|
|
@@ -352,6 +352,7 @@ enum lruvec_flags {
|
|
#ifndef __GENERATING_BOUNDS_H
|
|
|
|
struct lruvec;
|
|
+struct page_vma_mapped_walk;
|
|
|
|
#define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
|
|
#define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
|
|
@@ -407,6 +408,7 @@ struct lru_gen_struct {
|
|
};
|
|
|
|
void lru_gen_init_lruvec(struct lruvec *lruvec);
|
|
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw);
|
|
|
|
#ifdef CONFIG_MEMCG
|
|
void lru_gen_init_memcg(struct mem_cgroup *memcg);
|
|
@@ -419,6 +421,10 @@ static inline void lru_gen_init_lruvec(s
|
|
{
|
|
}
|
|
|
|
+static inline void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
|
|
+{
|
|
+}
|
|
+
|
|
#ifdef CONFIG_MEMCG
|
|
static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
|
|
{
|
|
--- a/mm/internal.h
|
|
+++ b/mm/internal.h
|
|
@@ -35,6 +35,7 @@
|
|
void page_writeback_init(void);
|
|
|
|
vm_fault_t do_swap_page(struct vm_fault *vmf);
|
|
+void activate_page(struct page *page);
|
|
|
|
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
|
|
unsigned long floor, unsigned long ceiling);
|
|
--- a/mm/memcontrol.c
|
|
+++ b/mm/memcontrol.c
|
|
@@ -2798,6 +2798,7 @@ static void commit_charge(struct page *p
|
|
* - LRU isolation
|
|
* - lock_page_memcg()
|
|
* - exclusive reference
|
|
+ * - mem_cgroup_trylock_pages()
|
|
*/
|
|
page->memcg_data = (unsigned long)memcg;
|
|
}
|
|
--- a/mm/rmap.c
|
|
+++ b/mm/rmap.c
|
|
@@ -73,6 +73,7 @@
|
|
#include <linux/page_idle.h>
|
|
#include <linux/memremap.h>
|
|
#include <linux/userfaultfd_k.h>
|
|
+#include <linux/mm_inline.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
@@ -793,6 +794,12 @@ static bool page_referenced_one(struct p
|
|
}
|
|
|
|
if (pvmw.pte) {
|
|
+ if (lru_gen_enabled() && pte_young(*pvmw.pte) &&
|
|
+ !(vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ))) {
|
|
+ lru_gen_look_around(&pvmw);
|
|
+ referenced++;
|
|
+ }
|
|
+
|
|
if (ptep_clear_flush_young_notify(vma, address,
|
|
pvmw.pte)) {
|
|
/*
|
|
--- a/mm/swap.c
|
|
+++ b/mm/swap.c
|
|
@@ -325,7 +325,7 @@ static bool need_activate_page_drain(int
|
|
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
|
|
}
|
|
|
|
-static void activate_page(struct page *page)
|
|
+void activate_page(struct page *page)
|
|
{
|
|
page = compound_head(page);
|
|
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
|
|
@@ -345,7 +345,7 @@ static inline void activate_page_drain(i
|
|
{
|
|
}
|
|
|
|
-static void activate_page(struct page *page)
|
|
+void activate_page(struct page *page)
|
|
{
|
|
struct lruvec *lruvec;
|
|
|
|
--- a/mm/vmscan.c
|
|
+++ b/mm/vmscan.c
|
|
@@ -1409,6 +1409,11 @@ retry:
|
|
if (!sc->may_unmap && page_mapped(page))
|
|
goto keep_locked;
|
|
|
|
+ /* page_update_gen() tried to promote this page? */
|
|
+ if (lru_gen_enabled() && !ignore_references &&
|
|
+ page_mapped(page) && PageReferenced(page))
|
|
+ goto keep_locked;
|
|
+
|
|
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
|
|
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
|
|
|
|
@@ -2990,6 +2995,29 @@ static bool positive_ctrl_err(struct ctr
|
|
* the aging
|
|
******************************************************************************/
|
|
|
|
+/* promote pages accessed through page tables */
|
|
+static int page_update_gen(struct page *page, int gen)
|
|
+{
|
|
+ unsigned long new_flags, old_flags = READ_ONCE(page->flags);
|
|
+
|
|
+ VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
|
|
+ VM_WARN_ON_ONCE(!rcu_read_lock_held());
|
|
+
|
|
+ do {
|
|
+ /* lru_gen_del_page() has isolated this page? */
|
|
+ if (!(old_flags & LRU_GEN_MASK)) {
|
|
+ /* for shrink_page_list() */
|
|
+ new_flags = old_flags | BIT(PG_referenced);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
|
|
+ new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
|
|
+ } while (!try_cmpxchg(&page->flags, &old_flags, new_flags));
|
|
+
|
|
+ return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
+}
|
|
+
|
|
/* protect pages accessed multiple times through file descriptors */
|
|
static int page_inc_gen(struct lruvec *lruvec, struct page *page, bool reclaiming)
|
|
{
|
|
@@ -3001,6 +3029,11 @@ static int page_inc_gen(struct lruvec *l
|
|
VM_WARN_ON_ONCE_PAGE(!(old_flags & LRU_GEN_MASK), page);
|
|
|
|
do {
|
|
+ new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
|
|
+ /* page_update_gen() has promoted this page? */
|
|
+ if (new_gen >= 0 && new_gen != old_gen)
|
|
+ return new_gen;
|
|
+
|
|
new_gen = (old_gen + 1) % MAX_NR_GENS;
|
|
|
|
new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
|
|
@@ -3015,6 +3048,43 @@ static int page_inc_gen(struct lruvec *l
|
|
return new_gen;
|
|
}
|
|
|
|
+static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
|
|
+{
|
|
+ unsigned long pfn = pte_pfn(pte);
|
|
+
|
|
+ VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
|
|
+
|
|
+ if (!pte_present(pte) || is_zero_pfn(pfn))
|
|
+ return -1;
|
|
+
|
|
+ if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
|
|
+ return -1;
|
|
+
|
|
+ if (WARN_ON_ONCE(!pfn_valid(pfn)))
|
|
+ return -1;
|
|
+
|
|
+ return pfn;
|
|
+}
|
|
+
|
|
+static struct page *get_pfn_page(unsigned long pfn, struct mem_cgroup *memcg,
|
|
+ struct pglist_data *pgdat)
|
|
+{
|
|
+ struct page *page;
|
|
+
|
|
+ /* try to avoid unnecessary memory loads */
|
|
+ if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
|
|
+ return NULL;
|
|
+
|
|
+ page = compound_head(pfn_to_page(pfn));
|
|
+ if (page_to_nid(page) != pgdat->node_id)
|
|
+ return NULL;
|
|
+
|
|
+ if (page_memcg_rcu(page) != memcg)
|
|
+ return NULL;
|
|
+
|
|
+ return page;
|
|
+}
|
|
+
|
|
static void inc_min_seq(struct lruvec *lruvec, int type)
|
|
{
|
|
struct lru_gen_struct *lrugen = &lruvec->lrugen;
|
|
@@ -3214,6 +3284,114 @@ static void lru_gen_age_node(struct pgli
|
|
} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
|
|
}
|
|
|
|
+/*
|
|
+ * This function exploits spatial locality when shrink_page_list() walks the
|
|
+ * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages.
|
|
+ */
|
|
+void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
|
|
+{
|
|
+ int i;
|
|
+ pte_t *pte;
|
|
+ unsigned long start;
|
|
+ unsigned long end;
|
|
+ unsigned long addr;
|
|
+ unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)] = {};
|
|
+ struct page *page = pvmw->page;
|
|
+ struct mem_cgroup *memcg = page_memcg(page);
|
|
+ struct pglist_data *pgdat = page_pgdat(page);
|
|
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
|
+ DEFINE_MAX_SEQ(lruvec);
|
|
+ int old_gen, new_gen = lru_gen_from_seq(max_seq);
|
|
+
|
|
+ lockdep_assert_held(pvmw->ptl);
|
|
+ VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
|
|
+
|
|
+ if (spin_is_contended(pvmw->ptl))
|
|
+ return;
|
|
+
|
|
+ start = max(pvmw->address & PMD_MASK, pvmw->vma->vm_start);
|
|
+ end = min(pvmw->address | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
|
|
+
|
|
+ if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
|
|
+ if (pvmw->address - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
|
|
+ end = start + MIN_LRU_BATCH * PAGE_SIZE;
|
|
+ else if (end - pvmw->address < MIN_LRU_BATCH * PAGE_SIZE / 2)
|
|
+ start = end - MIN_LRU_BATCH * PAGE_SIZE;
|
|
+ else {
|
|
+ start = pvmw->address - MIN_LRU_BATCH * PAGE_SIZE / 2;
|
|
+ end = pvmw->address + MIN_LRU_BATCH * PAGE_SIZE / 2;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ pte = pvmw->pte - (pvmw->address - start) / PAGE_SIZE;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ arch_enter_lazy_mmu_mode();
|
|
+
|
|
+ for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
|
|
+ unsigned long pfn;
|
|
+
|
|
+ pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
|
|
+ if (pfn == -1)
|
|
+ continue;
|
|
+
|
|
+ if (!pte_young(pte[i]))
|
|
+ continue;
|
|
+
|
|
+ page = get_pfn_page(pfn, memcg, pgdat);
|
|
+ if (!page)
|
|
+ continue;
|
|
+
|
|
+ if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
|
|
+ VM_WARN_ON_ONCE(true);
|
|
+
|
|
+ if (pte_dirty(pte[i]) && !PageDirty(page) &&
|
|
+ !(PageAnon(page) && PageSwapBacked(page) &&
|
|
+ !PageSwapCache(page)))
|
|
+ set_page_dirty(page);
|
|
+
|
|
+ old_gen = page_lru_gen(page);
|
|
+ if (old_gen < 0)
|
|
+ SetPageReferenced(page);
|
|
+ else if (old_gen != new_gen)
|
|
+ __set_bit(i, bitmap);
|
|
+ }
|
|
+
|
|
+ arch_leave_lazy_mmu_mode();
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ if (bitmap_weight(bitmap, MIN_LRU_BATCH) < PAGEVEC_SIZE) {
|
|
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
|
|
+ page = pte_page(pte[i]);
|
|
+ activate_page(page);
|
|
+ }
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /* page_update_gen() requires stable page_memcg() */
|
|
+ if (!mem_cgroup_trylock_pages(memcg))
|
|
+ return;
|
|
+
|
|
+ spin_lock_irq(&lruvec->lru_lock);
|
|
+ new_gen = lru_gen_from_seq(lruvec->lrugen.max_seq);
|
|
+
|
|
+ for_each_set_bit(i, bitmap, MIN_LRU_BATCH) {
|
|
+ page = compound_head(pte_page(pte[i]));
|
|
+ if (page_memcg_rcu(page) != memcg)
|
|
+ continue;
|
|
+
|
|
+ old_gen = page_update_gen(page, new_gen);
|
|
+ if (old_gen < 0 || old_gen == new_gen)
|
|
+ continue;
|
|
+
|
|
+ lru_gen_update_size(lruvec, page, old_gen, new_gen);
|
|
+ }
|
|
+
|
|
+ spin_unlock_irq(&lruvec->lru_lock);
|
|
+
|
|
+ mem_cgroup_unlock_pages();
|
|
+}
|
|
+
|
|
/******************************************************************************
|
|
* the eviction
|
|
******************************************************************************/
|
|
@@ -3250,6 +3428,12 @@ static bool sort_page(struct lruvec *lru
|
|
return true;
|
|
}
|
|
|
|
+ /* promoted */
|
|
+ if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
|
|
+ list_move(&page->lru, &lrugen->lists[gen][type][zone]);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
/* protected */
|
|
if (tier > tier_idx) {
|
|
int hist = lru_hist_from_seq(lrugen->min_seq[type]);
|