From f4b881ce07ccb2a519f664afaa2a68225b612ca3 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Tue, 29 Jun 2021 20:46:47 -0600 Subject: [PATCH 07/10] mm: multigenerational lru: eviction The eviction consumes old generations. Given an lruvec, the eviction scans pages on lrugen->lists indexed by anon and file min_seq[] (modulo MAX_NR_GENS). It first tries to select a type based on the values of min_seq[]. If they are equal, it selects the type that has a lower refaulted %. The eviction sorts a page according to its updated generation number if the aging has found this page accessed. It also moves a page to the next generation if this page is from an upper tier that has a higher refaulted % than the base tier. The eviction increments min_seq[] of a selected type when it finds lrugen->lists indexed by min_seq[] of this selected type are empty. Each generation is divided into multiple tiers. Tiers represent different ranges of numbers of accesses from file descriptors only. Pages accessed N times via file descriptors belong to tier order_base_2(N). Each generation contains at most MAX_NR_TIERS tiers, and they require additional MAX_NR_TIERS-2 bits in page->flags. In contrast to moving between generations which requires list operations, moving between tiers only involves operations on page->flags and therefore has a negligible cost. A feedback loop modeled after the PID controller monitors refaulted % across all tiers and decides when to protect pages from which tiers. Unmapped pages are initially added to the oldest generation and then conditionally protected by tiers. Each tier keeps track of how many pages from it have refaulted. Tier 0 is the base tier and pages from it are evicted unconditionally because there are no better candidates. Pages from an upper tier are either evicted or moved to the next generation, depending on whether this upper tier has a higher refaulted % than the base tier. This model has the following advantages: 1) It removes the cost in the buffered access path and reduces the overall cost of protection because pages are conditionally protected in the reclaim path. 2) It takes mapped pages into account and avoids overprotecting pages accessed multiple times via file descriptors. 3 Additional tiers improve the protection of pages accessed more than twice. Signed-off-by: Yu Zhao Tested-by: Konstantin Kharlamov Change-Id: I64c06d8f2cdb83ac7d56c7e1d07f043483956cac --- include/linux/mm_inline.h | 10 + include/linux/mmzone.h | 33 +++ mm/swap.c | 42 +++ mm/vmscan.c | 555 +++++++++++++++++++++++++++++++++++++- mm/workingset.c | 120 ++++++++- 5 files changed, 757 insertions(+), 3 deletions(-) --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -106,6 +106,14 @@ static inline int lru_hist_from_seq(unsi return seq % NR_HIST_GENS; } +/* Convert the number of accesses to a tier. See the comment on MAX_NR_TIERS. */ +static inline int lru_tier_from_refs(int refs) +{ + VM_BUG_ON(refs > BIT(LRU_REFS_WIDTH)); + + return order_base_2(refs + 1); +} + /* The youngest and the second youngest generations are counted as active. */ static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { @@ -226,6 +234,8 @@ static inline bool lru_gen_del_page(stru gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; new_flags &= ~LRU_GEN_MASK; + if ((new_flags & LRU_REFS_FLAGS) != LRU_REFS_FLAGS) + new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); /* for shrink_page_list() */ if (reclaiming) new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim)); --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -319,6 +319,30 @@ struct page_vma_mapped_walk; #define MIN_NR_GENS 2 #define MAX_NR_GENS ((unsigned int)CONFIG_NR_LRU_GENS) +/* + * Each generation is divided into multiple tiers. Tiers represent different + * ranges of numbers of accesses from file descriptors, i.e., + * mark_page_accessed(). In contrast to moving between generations which + * requires the lru lock, moving between tiers only involves an atomic + * operation on page->flags and therefore has a negligible cost. + * + * The purposes of tiers are to: + * 1) estimate whether pages accessed multiple times via file descriptors are + * more active than pages accessed only via page tables by separating the two + * access types into upper tiers and the base tier, and comparing refaulted % + * across all tiers. + * 2) improve buffered io performance by deferring the protection of pages + * accessed multiple times until the eviction. That is the protection happens + * in the reclaim path, not the access path. + * + * Pages accessed N times via file descriptors belong to tier order_base_2(N). + * The base tier may be marked by PageReferenced(). All upper tiers are marked + * by PageReferenced() && PageWorkingset(). Additional bits from page->flags are + * used to support more than one upper tier. + */ +#define MAX_NR_TIERS ((unsigned int)CONFIG_TIERS_PER_GEN) +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) + /* Whether to keep stats for historical generations. */ #ifdef CONFIG_LRU_GEN_STATS #define NR_HIST_GENS ((unsigned int)CONFIG_NR_LRU_GENS) @@ -337,6 +361,15 @@ struct lrugen { struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the sizes of the multigenerational lru lists in pages */ unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + /* the exponential moving average of refaulted */ + unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; + /* the exponential moving average of protected+evicted */ + unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; + /* the base tier isn't protected, hence the minus one */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* incremented without holding the lru lock */ + atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; + atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* whether the multigenerational lru is enabled */ bool enabled[ANON_AND_FILE]; }; --- a/mm/swap.c +++ b/mm/swap.c @@ -389,6 +389,43 @@ static void __lru_cache_activate_page(st local_unlock(&lru_pvecs.lock); } +#ifdef CONFIG_LRU_GEN +static void page_inc_refs(struct page *page) +{ + unsigned long refs; + unsigned long old_flags, new_flags; + + if (PageUnevictable(page)) + return; + + /* see the comment on MAX_NR_TIERS */ + do { + new_flags = old_flags = READ_ONCE(page->flags); + + if (!(new_flags & BIT(PG_referenced))) { + new_flags |= BIT(PG_referenced); + continue; + } + + if (!(new_flags & BIT(PG_workingset))) { + new_flags |= BIT(PG_workingset); + continue; + } + + refs = new_flags & LRU_REFS_MASK; + refs = min(refs + BIT(LRU_REFS_PGOFF), LRU_REFS_MASK); + + new_flags &= ~LRU_REFS_MASK; + new_flags |= refs; + } while (new_flags != old_flags && + cmpxchg(&page->flags, old_flags, new_flags) != old_flags); +} +#else +static void page_inc_refs(struct page *page) +{ +} +#endif /* CONFIG_LRU_GEN */ + /* * Mark a page as having seen activity. * @@ -403,6 +440,11 @@ void mark_page_accessed(struct page *pag { page = compound_head(page); + if (lru_gen_enabled()) { + page_inc_refs(page); + return; + } + if (!PageReferenced(page)) { SetPageReferenced(page); } else if (PageUnevictable(page)) { --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1145,9 +1145,11 @@ static int __remove_mapping(struct addre if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; - mem_cgroup_swapout(page, swap); + + /* get a shadow entry before page_memcg() is cleared */ if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(page, target_memcg); + mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page, swap, shadow); xa_unlock_irq(&mapping->i_pages); put_swap_page(page, swap); @@ -1410,6 +1412,11 @@ retry: if (!sc->may_unmap && page_mapped(page)) goto keep_locked; + /* lru_gen_look_around() has updated this page? */ + if (lru_gen_enabled() && !ignore_references && + page_mapped(page) && PageReferenced(page)) + goto keep_locked; + may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); @@ -2505,6 +2512,9 @@ static void prepare_scan_count(pg_data_t unsigned long file; struct lruvec *target_lruvec; + if (lru_gen_enabled()) + return; + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); /* @@ -2845,6 +2855,17 @@ static int page_lru_gen(struct page *pag return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } +static int page_lru_tier(struct page *page) +{ + int refs; + unsigned long flags = READ_ONCE(page->flags); + + refs = (flags & LRU_REFS_FLAGS) == LRU_REFS_FLAGS ? + ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1 : 0; + + return lru_tier_from_refs(refs); +} + static int get_swappiness(struct mem_cgroup *memcg) { return mem_cgroup_get_nr_swap_pages(memcg) >= MIN_BATCH_SIZE ? @@ -3181,6 +3202,91 @@ done: } /****************************************************************************** + * refault feedback loop + ******************************************************************************/ + +/* + * A feedback loop modeled after the PID controller. Currently supports the + * proportional (P) and the integral (I) terms; the derivative (D) term can be + * added if necessary. The setpoint (SP) is the desired position; the process + * variable (PV) is the measured position. The error is the difference between + * the SP and the PV. A positive error results in a positive control output + * correction, which, in our case, is to allow eviction. + * + * The P term is refaulted % of the current generation being evicted. The I + * term is the exponential moving average of refaulted % of previously evicted + * generations, using the smoothing factor 1/2. + * + * Our goal is to maintain proportional refaulted % across all tiers. + */ +struct ctrl_pos { + unsigned long refaulted; + unsigned long total; + int gain; +}; + +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + struct ctrl_pos *pos) +{ + struct lrugen *lrugen = &lruvec->evictable; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + pos->total = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + pos->total += lrugen->protected[hist][type][tier - 1]; + pos->gain = gain; +} + +static void reset_ctrl_pos(struct lruvec *lruvec, int gen, int type) +{ + int tier; + int hist = lru_hist_from_seq(gen); + struct lrugen *lrugen = &lruvec->evictable; + bool carryover = gen == lru_gen_from_seq(lrugen->min_seq[type]); + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + + if (!carryover && !clear) + return; + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + if (carryover) { + unsigned long sum; + + sum = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); + + sum = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + sum += lrugen->protected[hist][type][tier - 1]; + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); + } + + if (clear) { + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); + atomic_long_set(&lrugen->evicted[hist][type][tier], 0); + if (tier) + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + } + } +} + +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) +{ + /* + * Allow eviction if the PV has a limited number of refaulted pages or a + * lower refaulted % than the SP. + */ + return pv->refaulted < MIN_BATCH_SIZE || + pv->refaulted * max(sp->total, 1UL) * sp->gain <= + sp->refaulted * max(pv->total, 1UL) * pv->gain; +} + +/****************************************************************************** * the aging ******************************************************************************/ @@ -3200,6 +3306,7 @@ static int page_update_gen(struct page * new_flags &= ~LRU_GEN_MASK; new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); } while (new_flags != old_flags && cmpxchg(&page->flags, old_flags, new_flags) != old_flags); @@ -3231,6 +3338,7 @@ static void page_inc_gen(struct page *pa new_flags &= ~LRU_GEN_MASK; new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; + new_flags &= ~(LRU_REFS_MASK | LRU_REFS_FLAGS); /* for end_page_writeback() */ if (reclaiming) new_flags |= BIT(PG_reclaim); @@ -3722,6 +3830,7 @@ static bool inc_min_seq(struct lruvec *l } } + reset_ctrl_pos(lruvec, gen, type); WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); return true; @@ -3759,6 +3868,8 @@ next: if (min_seq[type] == lrugen->min_seq[type]) continue; + gen = lru_gen_from_seq(lrugen->min_seq[type]); + reset_ctrl_pos(lruvec, gen, type); WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); success = true; } @@ -3820,6 +3931,9 @@ static void inc_max_seq(struct lruvec *l } } + for (type = 0; type < ANON_AND_FILE; type++) + reset_ctrl_pos(lruvec, gen, type); + WRITE_ONCE(lrugen->timestamps[gen], jiffies); /* make sure all preceding modifications appear first */ smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); @@ -4101,6 +4215,433 @@ void lru_gen_look_around(struct page_vma } /****************************************************************************** + * the eviction + ******************************************************************************/ + +static bool sort_page(struct page *page, struct lruvec *lruvec, int tier_idx) +{ + bool success; + int gen = page_lru_gen(page); + int type = page_is_file_lru(page); + int zone = page_zonenum(page); + int tier = page_lru_tier(page); + int delta = thp_nr_pages(page); + struct lrugen *lrugen = &lruvec->evictable; + + VM_BUG_ON_PAGE(gen >= MAX_NR_GENS, page); + + /* an mlocked page? */ + if (!page_evictable(page)) { + success = lru_gen_del_page(page, lruvec, true); + VM_BUG_ON_PAGE(!success, page); + SetPageUnevictable(page); + add_page_to_lru_list(page, lruvec); + __count_vm_events(UNEVICTABLE_PGCULLED, delta); + return true; + } + + /* a lazy-free page that has been written into? */ + if (type && PageDirty(page) && PageAnon(page)) { + success = lru_gen_del_page(page, lruvec, true); + VM_BUG_ON_PAGE(!success, page); + SetPageSwapBacked(page); + add_page_to_lru_list_tail(page, lruvec); + return true; + } + + /* page_update_gen() has updated this page? */ + if (gen != lru_gen_from_seq(lrugen->min_seq[type])) { + list_move(&page->lru, &lrugen->lists[gen][type][zone]); + return true; + } + + /* protect this page if its tier has a higher refaulted % */ + if (tier > tier_idx) { + int hist = lru_hist_from_seq(gen); + + page_inc_gen(page, lruvec, false); + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], + lrugen->protected[hist][type][tier - 1] + delta); + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + return true; + } + + /* mark this page for reclaim if it's pending writeback */ + if (PageWriteback(page) || (type && PageDirty(page))) { + page_inc_gen(page, lruvec, true); + return true; + } + + return false; +} + +static bool isolate_page(struct page *page, struct lruvec *lruvec, struct scan_control *sc) +{ + bool success; + + if (!sc->may_unmap && page_mapped(page)) + return false; + + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && + (PageDirty(page) || (PageAnon(page) && !PageSwapCache(page)))) + return false; + + if (!get_page_unless_zero(page)) + return false; + + if (!TestClearPageLRU(page)) { + put_page(page); + return false; + } + + success = lru_gen_del_page(page, lruvec, true); + VM_BUG_ON_PAGE(!success, page); + + return true; +} + +static int scan_pages(struct lruvec *lruvec, struct scan_control *sc, + int type, int tier, struct list_head *list) +{ + int gen, zone; + enum vm_event_item item; + int sorted = 0; + int scanned = 0; + int isolated = 0; + int remaining = MAX_BATCH_SIZE; + struct lrugen *lrugen = &lruvec->evictable; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_BUG_ON(!list_empty(list)); + + if (get_nr_gens(lruvec, type) == MIN_NR_GENS) + return 0; + + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + for (zone = sc->reclaim_idx; zone >= 0; zone--) { + LIST_HEAD(moved); + int skipped = 0; + struct list_head *head = &lrugen->lists[gen][type][zone]; + + while (!list_empty(head)) { + struct page *page = lru_to_page(head); + int delta = thp_nr_pages(page); + + VM_BUG_ON_PAGE(PageTail(page), page); + VM_BUG_ON_PAGE(PageUnevictable(page), page); + VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page); + VM_BUG_ON_PAGE(page_zonenum(page) != zone, page); + + prefetchw_prev_lru_page(page, head, flags); + + scanned += delta; + + if (sort_page(page, lruvec, tier)) + sorted += delta; + else if (isolate_page(page, lruvec, sc)) { + list_add(&page->lru, list); + isolated += delta; + } else { + list_move(&page->lru, &moved); + skipped += delta; + } + + if (!--remaining || max(isolated, skipped) >= MIN_BATCH_SIZE) + break; + } + + if (skipped) { + list_splice(&moved, head); + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); + } + + if (!remaining || isolated >= MIN_BATCH_SIZE) + break; + } + + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + if (!cgroup_reclaim(sc)) { + __count_vm_events(item, isolated); + __count_vm_events(PGREFILL, sorted); + } + __count_memcg_events(memcg, item, isolated); + __count_memcg_events(memcg, PGREFILL, sorted); + __count_vm_events(PGSCAN_ANON + type, isolated); + + /* + * We may have trouble finding eligible pages due to reclaim_idx, + * may_unmap and may_writepage. Check `remaining` to make sure we won't + * be stuck if we aren't making enough progress. + */ + return isolated || !remaining ? scanned : 0; +} + +static int get_tier_idx(struct lruvec *lruvec, int type) +{ + int tier; + struct ctrl_pos sp, pv; + + /* + * Ideally we don't want to evict upper tiers that have higher refaulted + * %. However, we need to leave a margin for the fluctuation in + * refaulted %. So we use a larger gain factor to make sure upper tiers + * are indeed more active. We choose 2 because the lowest upper tier + * would have twice of refaulted % of the base tier, according to their + * numbers of accesses. + */ + read_ctrl_pos(lruvec, type, 0, 1, &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, 2, &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + return tier - 1; +} + +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +{ + int type, tier; + struct ctrl_pos sp, pv; + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; + + /* + * Compare refaulted % between the base tiers of anon and file to + * determine which type to evict. Also need to compare refaulted % of + * the upper tiers of the selected type with that of the base tier of + * the other type to determine which tier of the selected type to evict. + */ + read_ctrl_pos(lruvec, 0, 0, gain[0], &sp); + read_ctrl_pos(lruvec, 1, 0, gain[1], &pv); + type = positive_ctrl_err(&sp, &pv); + + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, gain[type], &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + *tier_idx = tier - 1; + + return type; +} + +static int isolate_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + int *type_scanned, struct list_head *list) +{ + int i; + int type; + int scanned; + int tier = -1; + DEFINE_MIN_SEQ(lruvec); + + VM_BUG_ON(!seq_is_valid(lruvec)); + + /* + * Try to select a type based on generations and swappiness, and if that + * fails, fall back to get_type_to_scan(). When anon and file are both + * available from the same generation, swappiness 200 is interpreted as + * anon first and swappiness 1 is interpreted as file first. + */ + if (!swappiness) + type = 1; + else if (min_seq[0] < min_seq[1]) + type = 0; + else if (swappiness == 1) + type = 1; + else if (swappiness == 200) + type = 0; + else + type = get_type_to_scan(lruvec, swappiness, &tier); + + for (i = !swappiness; i < ANON_AND_FILE; i++) { + if (tier < 0) + tier = get_tier_idx(lruvec, type); + + scanned = scan_pages(lruvec, sc, type, tier, list); + if (scanned) + break; + + type = !type; + tier = -1; + } + + *type_scanned = type; + + return scanned; +} + +/* Main function used by the foreground, the background and the user-triggered eviction. */ +static int evict_pages(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +{ + int type; + int scanned; + int reclaimed; + LIST_HEAD(list); + struct page *page; + enum vm_event_item item; + struct reclaim_stat stat; + struct mm_walk_args *args; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock_irq(&lruvec->lru_lock); + + scanned = isolate_pages(lruvec, sc, swappiness, &type, &list); + + if (try_to_inc_min_seq(lruvec, swappiness)) + scanned++; + + if (get_nr_gens(lruvec, 1) == MIN_NR_GENS) + scanned = 0; + + spin_unlock_irq(&lruvec->lru_lock); + + if (list_empty(&list)) + return scanned; + + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); + /* + * We need to prevent rejected pages from being added back to the same + * lists they were isolated from. Otherwise we may risk looping on them + * forever. + */ + list_for_each_entry(page, &list, lru) { + if (!PageReclaim(page) || !(PageDirty(page) || PageWriteback(page))) + SetPageActive(page); + + ClearPageReferenced(page); + ClearPageWorkingset(page); + } + + spin_lock_irq(&lruvec->lru_lock); + + move_pages_to_lru(lruvec, &list); + + args = current->reclaim_state ? current->reclaim_state->mm_walk_args : NULL; + if (args && args->batch_size) + reset_batch_size(lruvec, args); + + item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (!cgroup_reclaim(sc)) + __count_vm_events(item, reclaimed); + __count_memcg_events(memcg, item, reclaimed); + __count_vm_events(PGSTEAL_ANON + type, reclaimed); + + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_uncharge_list(&list); + free_unref_page_list(&list); + + sc->nr_reclaimed += reclaimed; + + return scanned; +} + +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +{ + bool low; + long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + int priority = sc->priority; + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (mem_cgroup_below_min(memcg) || + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + return 0; + + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { + priority = DEF_PRIORITY; + sc->force_deactivate = 0; + } + + nr_to_scan = get_nr_evictable(lruvec, sc, swappiness, max_seq, min_seq, &low); + if (!nr_to_scan) + return 0; + + nr_to_scan >>= priority; + + if (!mem_cgroup_online(memcg)) + nr_to_scan++; + + if (!nr_to_scan) + return 0; + + if (current_is_kswapd()) { + /* leave the work to lru_gen_age_node() */ + if (max_seq - min_seq[1] < MIN_NR_GENS) + return 0; + + if (!low) + sc->force_deactivate = 0; + + return nr_to_scan; + } + + if (max_seq - min_seq[1] >= MIN_NR_GENS) + return nr_to_scan; + + /* move onto slab and other memcgs if we haven't tried them all */ + if (!sc->force_deactivate) { + sc->skipped_deactivate = 1; + return 0; + } + + return try_to_inc_max_seq(lruvec, sc, swappiness, max_seq, true) ? nr_to_scan : 0; +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + long scanned = 0; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + lru_add_drain(); + + if (current_is_kswapd()) + current->reclaim_state->mm_walk_args = &pgdat->mm_walk_args; + + blk_start_plug(&plug); + + while (true) { + int delta; + int swappiness; + long nr_to_scan; + + if (sc->may_swap) + swappiness = get_swappiness(memcg); + else if (!cgroup_reclaim(sc) && get_swappiness(memcg)) + swappiness = 1; + else + swappiness = 0; + + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); + if (!nr_to_scan) + break; + + delta = evict_pages(lruvec, sc, swappiness); + if (!delta) + break; + + scanned += delta; + if (scanned >= nr_to_scan) + break; + + cond_resched(); + } + + blk_finish_plug(&plug); + + if (current_is_kswapd()) + current->reclaim_state->mm_walk_args = NULL; +} + +/****************************************************************************** * state change ******************************************************************************/ @@ -4355,6 +4896,10 @@ static void lru_gen_age_node(struct pgli { } +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ +} + #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) @@ -4368,6 +4913,11 @@ static void shrink_lruvec(struct lruvec bool proportional_reclaim; struct blk_plug plug; + if (lru_gen_enabled()) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } + get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ @@ -4839,6 +5389,9 @@ static void snapshot_refaults(struct mem struct lruvec *target_lruvec; unsigned long refaults; + if (lru_gen_enabled()) + return; + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[0] = refaults; --- a/mm/workingset.c +++ b/mm/workingset.c @@ -187,7 +187,6 @@ static unsigned int bucket_order __read_ static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction, bool workingset) { - eviction >>= bucket_order; eviction &= EVICTION_MASK; eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid; eviction = (eviction << NODES_SHIFT) | pgdat->node_id; @@ -212,10 +211,117 @@ static void unpack_shadow(void *shadow, *memcgidp = memcgid; *pgdat = NODE_DATA(nid); - *evictionp = entry << bucket_order; + *evictionp = entry; *workingsetp = workingset; } +#ifdef CONFIG_LRU_GEN + +static int page_lru_refs(struct page *page) +{ + unsigned long flags = READ_ONCE(page->flags); + + BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT); + + /* see the comment on MAX_NR_TIERS */ + return flags & BIT(PG_workingset) ? (flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF : 0; +} + +/* Return a token to be stored in the shadow entry of a page being evicted. */ +static void *lru_gen_eviction(struct page *page) +{ + int hist, tier; + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; + struct lrugen *lrugen; + int type = page_is_file_lru(page); + int refs = page_lru_refs(page); + int delta = thp_nr_pages(page); + bool workingset = PageWorkingset(page); + struct mem_cgroup *memcg = page_memcg(page); + struct pglist_data *pgdat = page_pgdat(page); + + lruvec = mem_cgroup_lruvec(memcg, pgdat); + lrugen = &lruvec->evictable; + min_seq = READ_ONCE(lrugen->min_seq[type]); + token = (min_seq << LRU_REFS_WIDTH) | refs; + + hist = lru_hist_from_seq(min_seq); + tier = lru_tier_from_refs(refs + workingset); + atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); + + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); +} + +/* Count a refaulted page based on the token stored in its shadow entry. */ +static void lru_gen_refault(struct page *page, void *shadow) +{ + int hist, tier, refs; + int memcg_id; + bool workingset; + unsigned long token; + unsigned long min_seq; + struct lruvec *lruvec; + struct lrugen *lrugen; + struct mem_cgroup *memcg; + struct pglist_data *pgdat; + int type = page_is_file_lru(page); + int delta = thp_nr_pages(page); + + unpack_shadow(shadow, &memcg_id, &pgdat, &token, &workingset); + if (page_pgdat(page) != pgdat) + return; + + rcu_read_lock(); + memcg = page_memcg_rcu(page); + if (mem_cgroup_id(memcg) != memcg_id) + goto unlock; + + refs = token & (BIT(LRU_REFS_WIDTH) - 1); + if (refs && !workingset) + goto unlock; + + token >>= LRU_REFS_WIDTH; + lruvec = mem_cgroup_lruvec(memcg, pgdat); + lrugen = &lruvec->evictable; + min_seq = READ_ONCE(lrugen->min_seq[type]); + if (token != (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH))) + goto unlock; + + hist = lru_hist_from_seq(min_seq); + tier = lru_tier_from_refs(refs + workingset); + atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta); + + /* + * Tiers don't offer any protection to pages accessed via page tables. + * That's what generations do. Tiers can't fully protect pages after + * their numbers of accesses has exceeded the max value. Conservatively + * count these two conditions as stalls even though they might not + * indicate any real memory pressure. + */ + if (task_in_nonseq_fault() || refs + workingset == BIT(LRU_REFS_WIDTH)) { + SetPageWorkingset(page); + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); + } +unlock: + rcu_read_unlock(); +} + +#else + +static void *lru_gen_eviction(struct page *page) +{ + return NULL; +} + +static void lru_gen_refault(struct page *page, void *shadow) +{ +} + +#endif /* CONFIG_LRU_GEN */ + /** * workingset_age_nonresident - age non-resident entries as LRU ages * @lruvec: the lruvec that was aged @@ -264,10 +370,14 @@ void *workingset_eviction(struct page *p VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); + if (lru_gen_enabled()) + return lru_gen_eviction(page); + lruvec = mem_cgroup_lruvec(target_memcg, pgdat); /* XXX: target_memcg can be NULL, go through lruvec */ memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); + eviction >>= bucket_order; workingset_age_nonresident(lruvec, thp_nr_pages(page)); return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page)); } @@ -296,7 +406,13 @@ void workingset_refault(struct page *pag bool workingset; int memcgid; + if (lru_gen_enabled()) { + lru_gen_refault(page, shadow); + return; + } + unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset); + eviction <<= bucket_order; rcu_read_lock(); /*