linux页框回收之shrink_node函数源码剖析
概述
《Linux内存回收入口_nginux的博客-CSDN博客》前文我们概略地描述了几种内存回收入口,我们知道这几种回收入口最终都会调用进入shrink_node函数,本文将以Linux 5.9源码来描述shrink_node函数的源码实现。
函数调用流程图
scan_control数据结构
struct scan_control {/* How many pages shrink_list() should reclaim */unsigned long nr_to_reclaim;/** Nodemask of nodes allowed by the caller. If NULL, all nodes* are scanned.*/nodemask_t *nodemask;/** The memory cgroup that hit its limit and as a result is the* primary target of this reclaim invocation.*/struct mem_cgroup *target_mem_cgroup;/** Scan pressure balancing between anon and file LRUs*/unsigned long anon_cost;unsigned long file_cost;/* Can active pages be deactivated as part of reclaim? *///是否能从active lru列表进行deactivate的reclaim
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2unsigned int may_deactivate:2;//如果是1:代表强制进行deactivate,即同时deactivate file和anon//如果是0,按需进行deactivate file或者anon,具体条件见下面shrink_node源码分析unsigned int force_deactivate:1;unsigned int skipped_deactivate:1;/* Writepage batching in laptop mode; RECLAIM_WRITE */unsigned int may_writepage:1;/* Can mapped pages be reclaimed? */unsigned int may_unmap:1;/* Can pages be swapped as part of reclaim? */unsigned int may_swap:1;/** Cgroups are not reclaimed below their configured memory.low,* unless we threaten to OOM. If any cgroups are skipped due to* memory.low and nothing was reclaimed, go back for memory.low.*/unsigned int memcg_low_reclaim:1;unsigned int memcg_low_skipped:1;unsigned int hibernation_mode:1;/* One of the zones is ready for compaction */unsigned int compaction_ready:1;/* There is easily reclaimable cold cache in the current node *///设置为1代表只回收file page cache,不回收aone pageunsigned int cache_trim_mode:1;/* The file pages on the current node are dangerously low *///设置1代表只回收aone page,不回收file pageunsigned int file_is_tiny:1;/* Allocation order */s8 order;/* Scan (total_size >> priority) pages at once */s8 priority;/* The highest zone to isolate pages for reclaim from */s8 reclaim_idx;/* This context's GFP mask */gfp_t gfp_mask;/* Incremented by the number of inactive pages that were scanned */unsigned long nr_scanned;/* Number of pages freed so far during a call to shrink_zones() */unsigned long nr_reclaimed;struct {unsigned int dirty;unsigned int unqueued_dirty;unsigned int congested;unsigned int writeback;unsigned int immediate;unsigned int file_taken;unsigned int taken;} nr;/* for recording the reclaimed slab by now */struct reclaim_state reclaim_state;
};
shrink_node函数
/*
 * Reclaim pages from node @pgdat according to the constraints in @sc.
 *
 * Each pass:
 *  1. snapshots the anon/file cost of the target lruvec,
 *  2. decides which active lists may be deactivated (sc->may_deactivate)
 *     and sets the cache_trim_mode / file_is_tiny hints,
 *  3. runs shrink_node_memcgs() to do the actual reclaim,
 *  4. accounts reclaimed slab, reports vmpressure, updates the node /
 *     lruvec congestion flags and throttles if needed,
 *  5. repeats from "again" while should_continue_reclaim() returns true.
 *
 * On any reclaim progress pgdat->kswapd_failures is reset to 0.
 */
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long nr_reclaimed, nr_scanned;
	struct lruvec *target_lruvec;
	bool reclaimable = false;
	unsigned long file;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
	/* Start every pass with clean per-pass counters. */
	memset(&sc->nr, 0, sizeof(sc->nr));

	/* Remember the totals so this pass's delta can be computed later. */
	nr_reclaimed = sc->nr_reclaimed;
	nr_scanned = sc->nr_scanned;

	/*
	 * Determine the scan balance between anon and file LRUs.
	 */
	spin_lock_irq(&pgdat->lru_lock);
	sc->anon_cost = target_lruvec->anon_cost;
	sc->file_cost = target_lruvec->file_cost;
	spin_unlock_irq(&pgdat->lru_lock);

	/*
	 * Target desirable inactive:active list ratios for the anon
	 * and file LRU lists.
	 */
	if (!sc->force_deactivate) {
		unsigned long refaults;

		/*
		 * The anon refault count changed since the value stored in
		 * target_lruvec->refaults[0], or the inactive anon list is
		 * low: allow deactivating anon pages.
		 */
		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_ANON);
		if (refaults != target_lruvec->refaults[0] ||
			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
			sc->may_deactivate |= DEACTIVATE_ANON;
		else
			sc->may_deactivate &= ~DEACTIVATE_ANON;

		/*
		 * When refaults are being observed, it means a new
		 * workingset is being established. Deactivate to get
		 * rid of any stale active pages quickly.
		 */
		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_FILE);
		if (refaults != target_lruvec->refaults[1] ||
			inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
			sc->may_deactivate |= DEACTIVATE_FILE;
		else
			sc->may_deactivate &= ~DEACTIVATE_FILE;
	} else
		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

	/*
	 * If we have plenty of inactive file pages that aren't
	 * thrashing, try to reclaim those first before touching
	 * anonymous pages.
	 */
	/* file = number of inactive file pages in the target lruvec */
	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
		/*
		 * NOTE(review): expected to make get_scan_count() scan
		 * file pages only; that function is not shown here.
		 */
		sc->cache_trim_mode = 1;
	else
		sc->cache_trim_mode = 0;

	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU.  And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes infinitely more attractive than
	 * anon pages.  Try to detect this based on file LRU size.
	 */
	if (!cgroup_reclaim(sc)) {
		unsigned long total_high_wmark = 0;
		unsigned long free, anon;
		int z;

		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
			   node_page_state(pgdat, NR_INACTIVE_FILE);

		/* Sum the high watermark of every managed zone on the node. */
		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];
			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		/*
		 * Consider anon: if that's low too, this isn't a
		 * runaway file reclaim problem, but rather just
		 * extreme pressure. Reclaim as per usual then.
		 */
		anon = node_page_state(pgdat, NR_INACTIVE_ANON);

		/*
		 * NOTE(review): a set file_is_tiny is expected to make
		 * get_scan_count() scan anon pages only; not shown here.
		 */
		sc->file_is_tiny =
			file + free <= total_high_wmark &&
			!(sc->may_deactivate & DEACTIVATE_ANON) &&
			anon >> sc->priority;
	}

	/* Core of the reclaim work for this pass. */
	shrink_node_memcgs(pgdat, sc);

	/* Fold slab pages freed during this pass into the reclaim total. */
	if (reclaim_state) {
		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
		reclaim_state->reclaimed_slab = 0;
	}

	/* Record the subtree's reclaim efficiency */
	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
		   sc->nr_scanned - nr_scanned,
		   sc->nr_reclaimed - nr_reclaimed);

	/* This pass reclaimed at least one page. */
	if (sc->nr_reclaimed - nr_reclaimed)
		reclaimable = true;

	/*
	 * Only kswapd sets the pgdat flags below.
	 * NOTE(review): presumably because only kswapd clears them, so
	 * another setter (e.g. memcg reclaim) could leave them stuck if
	 * kswapd is never woken - confirm against the clearing site.
	 */
	if (current_is_kswapd()) {
		/*
		 * If reclaim is isolating dirty pages under writeback,
		 * it implies that the long-lived page allocation rate
		 * is exceeding the page laundering rate. Either the
		 * global limits are not being effective at throttling
		 * processes due to the page distribution throughout
		 * zones or there is heavy usage of a slow backing
		 * device. The only option is to throttle from reclaim
		 * context which is not ideal as there is no guarantee
		 * the dirtying process is throttled in the same way
		 * balance_dirty_pages() manages.
		 *
		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
		 * count the number of pages under pages flagged for
		 * immediate reclaim and stall if any are encountered
		 * in the nr_immediate check below.
		 */
		/* PGDAT_WRITEBACK: every page taken this pass was under writeback. */
		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
			set_bit(PGDAT_WRITEBACK, &pgdat->flags);

		/* Allow kswapd to start writing pages during reclaim.*/
		/* PGDAT_DIRTY: every file page taken was dirty and not yet queued. */
		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
			set_bit(PGDAT_DIRTY, &pgdat->flags);

		/*
		 * If kswapd scans pages marked for immediate
		 * reclaim and under writeback (nr_immediate), it
		 * implies that pages are cycling through the LRU
		 * faster than they are written so also forcibly stall.
		 */
		if (sc->nr.immediate)
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

	/*
	 * Tag a node/memcg as congested if all the dirty pages
	 * scanned were backed by a congested BDI and
	 * wait_iff_congested will stall.
	 *
	 * Legacy memcg will stall in page writeback so avoid forcibly
	 * stalling in wait_iff_congested().
	 */
	/*
	 * LRUVEC_CONGESTED is set by kswapd, or by cgroup reclaim when
	 * writeback_throttling_sane() holds; other direct reclaim never
	 * sets it.
	 */
	if ((current_is_kswapd() ||
	     (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
		set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

	/*
	 * Stall direct reclaim for IO completions if underlying BDIs
	 * and node is congested. Allow kswapd to continue until it
	 * starts encountering unqueued dirty pages or cycling through
	 * the LRU too quickly.
	 */
	/* Non-kswapd (direct) reclaim waits here while the lruvec is congested. */
	if (!current_is_kswapd() && current_may_throttle() &&
	    !sc->hibernation_mode &&
	    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
		wait_iff_congested(BLK_RW_ASYNC, HZ/10);

	/* Run another pass if more reclaim is still wanted. */
	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
				    sc))
		goto again;

	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;
}
should_continue_reclaim
/** Reclaim/compaction is used for high-order allocation requests. It reclaims* order-0 pages before compacting the zone. should_continue_reclaim() returns* true if more pages should be reclaimed such that when the page allocator* calls try_to_compact_pages() that it will have enough free pages to succeed.* It will give up earlier than that if there is difficulty reclaiming pages.*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,unsigned long nr_reclaimed,struct scan_control *sc)
{unsigned long pages_for_compaction;unsigned long inactive_lru_pages;int z;/* If not in reclaim/compaction mode, stop */if (!in_reclaim_compaction(sc))return false;/** Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX* number of pages that were scanned. This will return to the caller* with the risk reclaim/compaction and the resulting allocation attempt* fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL* allocations through requiring that the full LRU list has been scanned* first, by assuming that zero delta of sc->nr_scanned means full LRU* scan, but that approximation was wrong, and there were corner cases* where always a non-zero amount of pages were scanned.*/if (!nr_reclaimed)return false;//compaction_suitable会检查水位是否已满足条件(要根据orderPAGE_ALLOC_COSTLY_ORDER//使用不同的watermark,如果不满足就不会返回success/continue/* If compaction would go ahead or the allocation would succeed, stop */for (z = 0; z <= sc->reclaim_idx; z++) {struct zone *zone = &pgdat->node_zones[z];if (!managed_zone(zone))continue;//满足了水位return false,代表不要继续shrink_node了switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {case COMPACT_SUCCESS:case COMPACT_CONTINUE:return false;default:/* check next zone */;}}/** If we have not reclaimed enough pages for compaction and the* inactive lists are large enough, continue reclaiming*///上面水位检查不通过,且也没有reclaim足够的page来做compaction,那就继续reclaim吧pages_for_compaction = compact_gap(sc->order);inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);if (get_nr_swap_pages() > 0)inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);return inactive_lru_pages > pages_for_compaction;
}
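compaction_suitable会判定当前水位是否满足order申请:如果满足了,那么返回COMPACT_SUCCESS(或者返回COMPACT_CONTINUE,表示可以直接进行compaction),should_continue_reclaim返回false,说明不需要继续回收了;如果不满足,说明还没有回收到足够满足该order申请的内存,逻辑会继续往下走到inactive_lru_pages > pages_for_compaction的判定:如果inactive lru中的页面数大于compact_gap(sc->order)(即约2倍于被申请order大小的页面数),那就继续扫描回收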
参考文章:
[PATCH v2 4/4] mm/vmscan: Don't mess with pgdat->flags in memcg reclaim. - Andrey Ryabinin
Linux 内存管理_workingset内存_jianchwa的博客-CSDN博客
相关文章:

linux页框回收之shrink_node函数源码剖析
概述 《Linux内存回收入口_nginux的博客-CSDN博客》前文我们概略的描述了几种内存回收入口,我们知道几种回收入口最终都会调用进入shrink_node函数,本文将以Linux 5.9源码来描述shrink_node函数的源码实现。 函数调用流程图 scan_control数据结构 str…...

网络运维基础问题及解答
前言 本篇文章是对于网络运维基础技能的一些常见问题的解答,希望能够为进行期末复习或者对网络运维感兴趣的同学或专业人员提供一定的帮助。 问题及解答 1. 列举 3 种常用字符编码,简述怎样在 str 和 bytes 之间进行编码和解码。 答:常用的…...
【RabbitMQ】之保证数据不丢失方案
目录 一、数据丢失场景二、数据可靠性方案 1、生产者丢失消息解决方案2、MQ 队列丢失消息解决方案3、消费者丢失消息解决方案 一、数据丢失场景 MQ 消息数据完整的链路为:从 Producer 发送消息到 RabbitMQ 服务器中,再由 Broker 服务的 Exchange 根据…...

插入排序算法
插入排序 算法说明与代码实现: 以下是使用Go语言实现的插入排序算法示例代码: package mainimport "fmt"func insertionSort(arr []int) {n : len(arr)for i : 1; i < n; i {key : arr[i]j : i - 1for j > 0 && arr[j] > …...

Linux标准库API
目录 1.字符串函数 2.数据转换函数 3.格式化输入输出函数 4.权限控制函数 5.IO函数 6.进程控制函数 7.文件和目录函数 1.字符串函数 2.数据转换函数 3.格式化输入输出函数 #include<stdarg.h>void test(const char * format , ...){va_list ap;va_start(ap,format…...

腾讯云—自动挂载云盘
腾讯云,稍微麻烦了点。 腾讯云服务器,镜像为opencloudos 8。 ### 1、挂载云盘bash #首先通过以下命令,能够看到新的数据盘,如果不能需要通过腾讯云控制台卸载后,重新挂载,并重启服务器。 fdisk -l#为 /dev…...

为Win12做准备?微软Win11 23H2将集成AI助手:GPT4免费用
微软日前确认今年4季度推出Win11 23H2,这是Win11第二个年度更新。 Win11 23H2具体有哪些功能升级,现在还不好说,但它会集成微软的Copilot,它很容易让人想到多年前的“曲别针”助手,但这次是AI技术加持的,Co…...

Opencv Win10+Qt+Cmake 开发环境搭建
文章目录 一.Opencv安装二.Qt搭建opencv开发环境 一.Opencv安装 官网下载Opencv安装包 双击下载的软件进行解压 3. 系统环境变量添加 二.Qt搭建opencv开发环境 创建一个新的Qt项目(Non-Qt Project) 打开创建好的项目中的CMakeLists.txt,添加如下代码 # openc…...
Matlab实现光伏仿真(附上30个完整仿真源码)
光伏发电电池模型是描述光伏电池在不同条件下产生电能的数学模型。该模型可以用于预测光伏电池的输出功率,并为优化光伏电池系统设计和控制提供基础。本文将介绍如何使用Matlab实现光伏发电电池模型。 文章目录 1、光伏发电电池模型2、使用Matlab实现光伏发电电池模…...
JSON.stringify()与JSON.parse()
JSON.parse() 方法用来解析 JSON 字符串 onst json {"result":true, "count":42}; const obj JSON.parse(json); console.log(typeof(json)) //string console.log(typeof(obj)) //objJSON.stringify() 方法将一个 JavaScript 对象或值转换为 JSON 字…...

neo4j教程-安装部署
neo4j教程-安装部署 Neo4j的关键概念和特点 •Neo4j是一个开源的NoSQL图形存储数据库,可为应用程序提供支持ACID的后端。Neo4j的开发始于2003年,自2007年转变为开源图形数据库模型。程序员使用的是路由器和关系的灵活网络结构,而不是静态表…...

网络面试合集
传输层的数据结构是什么? 就是在问他的协议格式:UDP&TCP 2.1.1三次握手 通信前,要先建立连接,确保双方都是在线,具有数据收发的能力。 2.1.2四次挥手 通信结束后,会有一个断开连接的过程࿰…...

java+springboot+mysql智慧办公OA管理系统
项目介绍: 使用javaspringbootmysql开发的智慧办公OA管理系统,系统包含超级管理员,系统管理员、员工角色,功能如下: 超级管理员:管理员管理;部门管理;职位管理;员工管理…...

【教程】Tkinter实现Python软件自动更新与提醒
转载请注明出处:小锋学长生活大爆炸[xfxuezhang.cn] 文件下载:https://download.csdn.net/download/sxf1061700625/88134425 示例演示: 参考代码: import os import _thread import shutil import subprocess import sys import …...

音频深度学习变得简单:自动语音识别 (ASR),它是如何工作的
一、说明 在过去的几年里,随着Google Home,Amazon Echo,Siri,Cortana等的普及,语音助手已经无处不在。这些是自动语音识别 (ASR) 最著名的示例。此类应用程序从某种语言的语音音频剪辑开始&…...

反射简述
什么是反射反射在java中起到什么样的作用获取class对象的三种方式反射的优缺点图 什么是反射 JAVA反射机制是在运行状态中,对于任意一个类,都能够知道这个类的所有属性和方法;对于任意一个对象,都能够调用它的任意一个方法和属性&…...

Kotlin泛型的协变与逆变
以下内容摘自郭霖《第一行代码》第三版 泛型的协变 一个泛型类或者泛型接口中的方法,它的参数列表是接收数据的地方,因此可以称它为in位置,而它的返回值是输出数据的地方,因此可以称它为out位置。 先定义三个类: op…...

【后端面经】微服务构架 (1-6) | 隔离:如何确保心悦会员体验无忧?唱响隔离的鸣奏曲!
文章目录 一、前置知识1、什么是隔离?2、为什么要隔离?3、怎么进行隔离?A) 机房隔离B) 实例隔离C) 分组隔离D) 连接池隔离 与 线程池隔离E) 信号量隔离F) 第三方依赖隔离二、面试环节1、面试准备2、基本思路3、亮点方案A) 慢任务隔离B) 制作库与线上库分离三、章节总结 …...

复习之kickstart无人职守安装脚本
一、kickstart简介 kickstart是红帽发行版中的一种安装方式,它通过以配置文件的方式来记录linux系统安装的各项参数和想要安装的软件。只要配置正确,整个安装过程中无需人工交互参与,达到无人值守安装的目的。 二、kickstar文件的生成 进入/…...

CSS动画——实现波浪摇摆效果...
一、效果展示 以下主要实现四个动画: 元素上下摇摆动画波浪上下摇摆动画气泡上升及消失动画连续气泡右飘动画 二、实现思路 这里主要讲一下波浪上下摇摆动画和连续气泡右飘动画的实现思路 这里拿一张波浪图来举例解释实现波浪动画的思路: 波浪的摇…...

MFC内存泄露
1、泄露代码示例 void X::SetApplicationBtn() {CMFCRibbonApplicationButton* pBtn GetApplicationButton();// 获取 Ribbon Bar 指针// 创建自定义按钮CCustomRibbonAppButton* pCustomButton new CCustomRibbonAppButton();pCustomButton->SetImage(IDB_BITMAP_Jdp26)…...
基于服务器使用 apt 安装、配置 Nginx
🧾 一、查看可安装的 Nginx 版本 首先,你可以运行以下命令查看可用版本: apt-cache madison nginx-core输出示例: nginx-core | 1.18.0-6ubuntu14.6 | http://archive.ubuntu.com/ubuntu focal-updates/main amd64 Packages ng…...
Java多线程实现之Callable接口深度解析
Java多线程实现之Callable接口深度解析 一、Callable接口概述1.1 接口定义1.2 与Runnable接口的对比1.3 Future接口与FutureTask类 二、Callable接口的基本使用方法2.1 传统方式实现Callable接口2.2 使用Lambda表达式简化Callable实现2.3 使用FutureTask类执行Callable任务 三、…...
【Web 进阶篇】优雅的接口设计:统一响应、全局异常处理与参数校验
系列回顾: 在上一篇中,我们成功地为应用集成了数据库,并使用 Spring Data JPA 实现了基本的 CRUD API。我们的应用现在能“记忆”数据了!但是,如果你仔细审视那些 API,会发现它们还很“粗糙”:有…...

分布式增量爬虫实现方案
之前我们在讨论的是分布式爬虫如何实现增量爬取。增量爬虫的目标是只爬取新产生或发生变化的页面,避免重复抓取,以节省资源和时间。 在分布式环境下,增量爬虫的实现需要考虑多个爬虫节点之间的协调和去重。 另一种思路:将增量判…...

SAP学习笔记 - 开发26 - 前端Fiori开发 OData V2 和 V4 的差异 (Deepseek整理)
上一章用到了V2 的概念,其实 Fiori当中还有 V4,咱们这一章来总结一下 V2 和 V4。 SAP学习笔记 - 开发25 - 前端Fiori开发 Remote OData Service(使用远端Odata服务),代理中间件(ui5-middleware-simpleproxy)-CSDN博客…...

网站指纹识别
网站指纹识别 网站的最基本组成:服务器(操作系统)、中间件(web容器)、脚本语言、数据厍 为什么要了解这些?举个例子:发现了一个文件读取漏洞,我们需要读/etc/passwd,如…...

2025年渗透测试面试题总结-腾讯[实习]科恩实验室-安全工程师(题目+回答)
安全领域各种资源,学习文档,以及工具分享、前沿信息分享、POC、EXP分享。不定期分享各种好玩的项目及好用的工具,欢迎关注。 目录 腾讯[实习]科恩实验室-安全工程师 一、网络与协议 1. TCP三次握手 2. SYN扫描原理 3. HTTPS证书机制 二…...

day36-多路IO复用
一、基本概念 (服务器多客户端模型) 定义:单线程或单进程同时监测若干个文件描述符是否可以执行IO操作的能力 作用:应用程序通常需要处理来自多条事件流中的事件,比如我现在用的电脑,需要同时处理键盘鼠标…...
go 里面的指针
指针 在 Go 中,指针(pointer)是一个变量的内存地址,就像 C 语言那样: a : 10 p : &a // p 是一个指向 a 的指针 fmt.Println(*p) // 输出 10,通过指针解引用• &a 表示获取变量 a 的地址 p 表示…...