linux页框回收之shrink_node函数源码剖析
概述
《Linux内存回收入口_nginux的博客-CSDN博客》前文我们概略的描述了几种内存回收入口,我们知道几种回收入口最终都会调用进入shrink_node函数,本文将以Linux 5.9源码来描述shrink_node函数的源码实现。
函数调用流程图
scan_control数据结构
struct scan_control {/* How many pages shrink_list() should reclaim */unsigned long nr_to_reclaim;/** Nodemask of nodes allowed by the caller. If NULL, all nodes* are scanned.*/nodemask_t *nodemask;/** The memory cgroup that hit its limit and as a result is the* primary target of this reclaim invocation.*/struct mem_cgroup *target_mem_cgroup;/** Scan pressure balancing between anon and file LRUs*/unsigned long anon_cost;unsigned long file_cost;/* Can active pages be deactivated as part of reclaim? *///是否能从active lru列表进行deactivate的reclaim
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2unsigned int may_deactivate:2;//如果是1:代表强制进行deactivate,即同时deactivate file和anon//如果是0,按需进行deactivate file或者anon,具体条件见下面shrink_node源码分析unsigned int force_deactivate:1;unsigned int skipped_deactivate:1;/* Writepage batching in laptop mode; RECLAIM_WRITE */unsigned int may_writepage:1;/* Can mapped pages be reclaimed? */unsigned int may_unmap:1;/* Can pages be swapped as part of reclaim? */unsigned int may_swap:1;/** Cgroups are not reclaimed below their configured memory.low,* unless we threaten to OOM. If any cgroups are skipped due to* memory.low and nothing was reclaimed, go back for memory.low.*/unsigned int memcg_low_reclaim:1;unsigned int memcg_low_skipped:1;unsigned int hibernation_mode:1;/* One of the zones is ready for compaction */unsigned int compaction_ready:1;/* There is easily reclaimable cold cache in the current node *///设置为1代表只回收file page cache,不回收aone pageunsigned int cache_trim_mode:1;/* The file pages on the current node are dangerously low *///设置1代表只回收aone page,不回收file pageunsigned int file_is_tiny:1;/* Allocation order */s8 order;/* Scan (total_size >> priority) pages at once */s8 priority;/* The highest zone to isolate pages for reclaim from */s8 reclaim_idx;/* This context's GFP mask */gfp_t gfp_mask;/* Incremented by the number of inactive pages that were scanned */unsigned long nr_scanned;/* Number of pages freed so far during a call to shrink_zones() */unsigned long nr_reclaimed;struct {unsigned int dirty;unsigned int unqueued_dirty;unsigned int congested;unsigned int writeback;unsigned int immediate;unsigned int file_taken;unsigned int taken;} nr;/* for recording the reclaimed slab by now */struct reclaim_state reclaim_state;
};
shrink_node函数
/*
 * Reclaim pages from one node.
 *
 * @pgdat: the node to reclaim from
 * @sc:    reclaim context; its nr_scanned/nr_reclaimed totals and the
 *         may_deactivate/cache_trim_mode/file_is_tiny hints are updated here
 *
 * Each pass: snapshot the anon/file cost, decide which LRU types may be
 * deactivated, compute the cache_trim_mode and file_is_tiny hints, run
 * shrink_node_memcgs(), fold in reclaimed slab, report vmpressure, update
 * the writeback/dirty/congestion flags and possibly throttle. The pass is
 * repeated (goto again) while should_continue_reclaim() returns true.
 * If any pass reclaimed something, pgdat->kswapd_failures is reset to 0.
 */
static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
{
	struct reclaim_state *reclaim_state = current->reclaim_state;
	unsigned long nr_reclaimed, nr_scanned;
	struct lruvec *target_lruvec;
	bool reclaimable = false;
	unsigned long file;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);

again:
	/* Per-pass counters start from zero on every iteration. */
	memset(&sc->nr, 0, sizeof(sc->nr));

	/* Remember the totals so this pass's deltas can be computed below. */
	nr_reclaimed = sc->nr_reclaimed;
	nr_scanned = sc->nr_scanned;

	/*
	 * Determine the scan balance between anon and file LRUs.
	 */
	spin_lock_irq(&pgdat->lru_lock);
	sc->anon_cost = target_lruvec->anon_cost;
	sc->file_cost = target_lruvec->file_cost;
	spin_unlock_irq(&pgdat->lru_lock);

	/*
	 * Target desirable inactive:active list ratios for the anon
	 * and file LRU lists.
	 */
	if (!sc->force_deactivate) {
		unsigned long refaults;

		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_ANON);
		/*
		 * Annotation: if the anon refault count differs from the
		 * value stored in refaults[0], or the inactive anon list is
		 * low, allow anon deactivation; otherwise clear the bit.
		 */
		if (refaults != target_lruvec->refaults[0] ||
			inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
			sc->may_deactivate |= DEACTIVATE_ANON;
		else
			sc->may_deactivate &= ~DEACTIVATE_ANON;

		/*
		 * When refaults are being observed, it means a new
		 * workingset is being established. Deactivate to get
		 * rid of any stale active pages quickly.
		 */
		refaults = lruvec_page_state(target_lruvec,
				WORKINGSET_ACTIVATE_FILE);
		if (refaults != target_lruvec->refaults[1] ||
		    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
			sc->may_deactivate |= DEACTIVATE_FILE;
		else
			sc->may_deactivate &= ~DEACTIVATE_FILE;
	} else
		sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

	/*
	 * If we have plenty of inactive file pages that aren't
	 * thrashing, try to reclaim those first before touching
	 * anonymous pages.
	 */
	/* Annotation: 'file' is the number of inactive file pages. */
	file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
	if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
		/* Annotation: reclaim file pages only; read by get_scan_count(). */
		sc->cache_trim_mode = 1;
	else
		sc->cache_trim_mode = 0;

	/*
	 * Prevent the reclaimer from falling into the cache trap: as
	 * cache pages start out inactive, every cache fault will tip
	 * the scan balance towards the file LRU.  And as the file LRU
	 * shrinks, so does the window for rotation from references.
	 * This means we have a runaway feedback loop where a tiny
	 * thrashing file LRU becomes infinitely more attractive than
	 * anon pages.  Try to detect this based on file LRU size.
	 */
	if (!cgroup_reclaim(sc)) {
		unsigned long total_high_wmark = 0;
		unsigned long free, anon;
		int z;

		free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		file = node_page_state(pgdat, NR_ACTIVE_FILE) +
			   node_page_state(pgdat, NR_INACTIVE_FILE);

		/* Sum the high watermark over all managed zones of the node. */
		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];
			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		/*
		 * Consider anon: if that's low too, this isn't a
		 * runaway file reclaim problem, but rather just
		 * extreme pressure. Reclaim as per usual then.
		 */
		anon = node_page_state(pgdat, NR_INACTIVE_ANON);

		/*
		 * Annotation: when set, only anon pages are reclaimed and
		 * file pages are left alone.
		 */
		sc->file_is_tiny =
			file + free <= total_high_wmark &&
			!(sc->may_deactivate & DEACTIVATE_ANON) &&
			anon >> sc->priority;
	}

	/* Annotation: the core reclaim routine (covered in a later article). */
	shrink_node_memcgs(pgdat, sc);

	/* Credit slab pages freed during this pass, then reset the counter. */
	if (reclaim_state) {
		sc->nr_reclaimed += reclaim_state->reclaimed_slab;
		reclaim_state->reclaimed_slab = 0;
	}

	/* Record the subtree's reclaim efficiency */
	vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
		   sc->nr_scanned - nr_scanned,
		   sc->nr_reclaimed - nr_reclaimed);

	/* Annotation: this pass reclaimed at least one page. */
	if (sc->nr_reclaimed - nr_reclaimed)
		reclaimable = true;

	/*
	 * Annotation: the pgdat flags below are set only when running as
	 * kswapd. NOTE(review): the article's rationale is that only kswapd
	 * clears them, so letting e.g. memcg reclaim set them could leave
	 * them stuck -- see the referenced "Don't mess with pgdat->flags in
	 * memcg reclaim" patch; the clearing side is not visible here.
	 */
	if (current_is_kswapd()) {
		/*
		 * If reclaim is isolating dirty pages under writeback,
		 * it implies that the long-lived page allocation rate
		 * is exceeding the page laundering rate. Either the
		 * global limits are not being effective at throttling
		 * processes due to the page distribution throughout
		 * zones or there is heavy usage of a slow backing
		 * device. The only option is to throttle from reclaim
		 * context which is not ideal as there is no guarantee
		 * the dirtying process is throttled in the same way
		 * balance_dirty_pages() manages.
		 *
		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
		 * count the number of pages under pages flagged for
		 * immediate reclaim and stall if any are encountered
		 * in the nr_immediate check below.
		 */
		/*
		 * Annotation: PGDAT_WRITEBACK is set when every page taken
		 * in this pass was under writeback.
		 */
		if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
			set_bit(PGDAT_WRITEBACK, &pgdat->flags);

		/* Allow kswapd to start writing pages during reclaim.*/
		/*
		 * Annotation: PGDAT_DIRTY is set when all file pages taken
		 * in this pass were dirty and not yet queued for writeback.
		 */
		if (sc->nr.unqueued_dirty == sc->nr.file_taken)
			set_bit(PGDAT_DIRTY, &pgdat->flags);

		/*
		 * If kswapd scans pages marked for immediate
		 * reclaim and under writeback (nr_immediate), it
		 * implies that pages are cycling through the LRU
		 * faster than they are written so also forcibly stall.
		 */
		if (sc->nr.immediate)
			congestion_wait(BLK_RW_ASYNC, HZ/10);
	}

	/*
	 * Tag a node/memcg as congested if all the dirty pages
	 * scanned were backed by a congested BDI and
	 * wait_iff_congested will stall.
	 *
	 * Legacy memcg will stall in page writeback so avoid forcibly
	 * stalling in wait_iff_congested().
	 */
	/*
	 * Annotation: LRUVEC_CONGESTED is set by kswapd, or by cgroup
	 * reclaim when writeback_throttling_sane(sc) holds.
	 * NOTE(review): the article says the restriction exists so the flag
	 * cannot get stuck and block direct reclaim in wait_iff_congested();
	 * the clearing side is not visible here.
	 */
	if ((current_is_kswapd() ||
	     (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
	    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
		set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

	/*
	 * Stall direct reclaim for IO completions if underlying BDIs
	 * and node is congested. Allow kswapd to continue until it
	 * starts encountering unqueued dirty pages or cycling through
	 * the LRU too quickly.
	 */
	/*
	 * Annotation: a non-kswapd caller that may throttle waits here when
	 * the target lruvec is flagged congested, i.e. direct reclaim stalls.
	 */
	if (!current_is_kswapd() && current_may_throttle() &&
	    !sc->hibernation_mode &&
	    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
		wait_iff_congested(BLK_RW_ASYNC, HZ/10);

	/* Annotation: run another pass if more reclaim is still needed. */
	if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
				    sc))
		goto again;

	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;
}
should_continue_reclaim
/** Reclaim/compaction is used for high-order allocation requests. It reclaims* order-0 pages before compacting the zone. should_continue_reclaim() returns* true if more pages should be reclaimed such that when the page allocator* calls try_to_compact_pages() that it will have enough free pages to succeed.* It will give up earlier than that if there is difficulty reclaiming pages.*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,unsigned long nr_reclaimed,struct scan_control *sc)
{unsigned long pages_for_compaction;unsigned long inactive_lru_pages;int z;/* If not in reclaim/compaction mode, stop */if (!in_reclaim_compaction(sc))return false;/** Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX* number of pages that were scanned. This will return to the caller* with the risk reclaim/compaction and the resulting allocation attempt* fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL* allocations through requiring that the full LRU list has been scanned* first, by assuming that zero delta of sc->nr_scanned means full LRU* scan, but that approximation was wrong, and there were corner cases* where always a non-zero amount of pages were scanned.*/if (!nr_reclaimed)return false;//compaction_suitable会检查水位是否已满足条件(要根据orderPAGE_ALLOC_COSTLY_ORDER//使用不同的watermark,如果不满足就不会返回success/continue/* If compaction would go ahead or the allocation would succeed, stop */for (z = 0; z <= sc->reclaim_idx; z++) {struct zone *zone = &pgdat->node_zones[z];if (!managed_zone(zone))continue;//满足了水位return false,代表不要继续shrink_node了switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {case COMPACT_SUCCESS:case COMPACT_CONTINUE:return false;default:/* check next zone */;}}/** If we have not reclaimed enough pages for compaction and the* inactive lists are large enough, continue reclaiming*///上面水位检查不通过,且也没有reclaim足够的page来做compaction,那就继续reclaim吧pages_for_compaction = compact_gap(sc->order);inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);if (get_nr_swap_pages() > 0)inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);return inactive_lru_pages > pages_for_compaction;
}
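compaction_suitable会判定当前水位是否满足order申请:如果返回COMPACT_SUCCESS(水位已满足,分配可以直接成功)或COMPACT_CONTINUE(空闲页已足够,可以开始compact),should_continue_reclaim就返回false,不需要继续回收了;如果不满足,说明还没有回收到足够满足该order申请的内存,逻辑会继续往下走到inactive_lru_pages > pages_for_compaction的判定:如果inactive lru中的页面数大于compact_gap(order)(即所申请order页数的2倍),那就继续扫描回收。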
参考文章:
[PATCH v2 4/4] mm/vmscan: Don't mess with pgdat->flags in memcg reclaim. - Andrey Ryabinin
Linux 内存管理_workingset内存_jianchwa的博客-CSDN博客
相关文章:

linux页框回收之shrink_node函数源码剖析
概述 《Linux内存回收入口_nginux的博客-CSDN博客》前文我们概略的描述了几种内存回收入口,我们知道几种回收入口最终都会调用进入shrink_node函数,本文将以Linux 5.9源码来描述shrink_node函数的源码实现。 函数调用流程图 scan_control数据结构 str…...

网络运维基础问题及解答
前言 本篇文章是对于网络运维基础技能的一些常见问题的解答,希望能够为进行期末复习或者对网络运维感兴趣的同学或专业人员提供一定的帮助。 问题及解答 1. 列举 3 种常用字符编码,简述怎样在 str 和 bytes 之间进行编码和解码。 答:常用的…...
【RabbitMQ】之保证数据不丢失方案
目录 一、数据丢失场景二、数据可靠性方案 1、生产者丢失消息解决方案2、MQ 队列丢失消息解决方案3、消费者丢失消息解决方案 一、数据丢失场景 MQ 消息数据完整的链路为:从 Producer 发送消息到 RabbitMQ 服务器中,再由 Broker 服务的 Exchange 根据…...

插入排序算法
插入排序 算法说明与代码实现: 以下是使用Go语言实现的插入排序算法示例代码: package mainimport "fmt"func insertionSort(arr []int) {n : len(arr)for i : 1; i < n; i {key : arr[i]j : i - 1for j > 0 && arr[j] > …...

Linux标准库API
目录 1.字符串函数 2.数据转换函数 3.格式化输入输出函数 4.权限控制函数 5.IO函数 6.进程控制函数 7.文件和目录函数 1.字符串函数 2.数据转换函数 3.格式化输入输出函数 #include<stdarg.h>void test(const char * format , ...){va_list ap;va_start(ap,format…...

腾讯云—自动挂载云盘
腾讯云,稍微麻烦了点。 腾讯云服务器,镜像为opencloudos 8。 ### 1、挂载云盘bash #首先通过以下命令,能够看到新的数据盘,如果不能需要通过腾讯云控制台卸载后,重新挂载,并重启服务器。 fdisk -l#为 /dev…...

为Win12做准备?微软Win11 23H2将集成AI助手:GPT4免费用
微软日前确认今年4季度推出Win11 23H2,这是Win11第二个年度更新。 Win11 23H2具体有哪些功能升级,现在还不好说,但它会集成微软的Copilot,它很容易让人想到多年前的“曲别针”助手,但这次是AI技术加持的,Co…...

Opencv Win10+Qt+Cmake 开发环境搭建
文章目录 一.Opencv安装二.Qt搭建opencv开发环境 一.Opencv安装 官网下载Opencv安装包 双击下载的软件进行解压 3. 系统环境变量添加 二.Qt搭建opencv开发环境 创建一个新的Qt项目(Non-Qt Project) 打开创建好的项目中的CMakeLists.txt,添加如下代码 # openc…...
Matlab实现光伏仿真(附上30个完整仿真源码)
光伏发电电池模型是描述光伏电池在不同条件下产生电能的数学模型。该模型可以用于预测光伏电池的输出功率,并为优化光伏电池系统设计和控制提供基础。本文将介绍如何使用Matlab实现光伏发电电池模型。 文章目录 1、光伏发电电池模型2、使用Matlab实现光伏发电电池模…...
JSON.stringify()与JSON.parse()
JSON.parse() 方法用来解析 JSON 字符串 onst json {"result":true, "count":42}; const obj JSON.parse(json); console.log(typeof(json)) //string console.log(typeof(obj)) //objJSON.stringify() 方法将一个 JavaScript 对象或值转换为 JSON 字…...

neo4j教程-安装部署
neo4j教程-安装部署 Neo4j的关键概念和特点 •Neo4j是一个开源的NoSQL图形存储数据库,可为应用程序提供支持ACID的后端。Neo4j的开发始于2003年,自2007年转变为开源图形数据库模型。程序员使用的是路由器和关系的灵活网络结构,而不是静态表…...

网络面试合集
传输层的数据结构是什么? 就是在问他的协议格式:UDP&TCP 2.1.1三次握手 通信前,要先建立连接,确保双方都是在线,具有数据收发的能力。 2.1.2四次挥手 通信结束后,会有一个断开连接的过程࿰…...

java+springboot+mysql智慧办公OA管理系统
项目介绍: 使用javaspringbootmysql开发的智慧办公OA管理系统,系统包含超级管理员,系统管理员、员工角色,功能如下: 超级管理员:管理员管理;部门管理;职位管理;员工管理…...

【教程】Tkinter实现Python软件自动更新与提醒
转载请注明出处:小锋学长生活大爆炸[xfxuezhang.cn] 文件下载:https://download.csdn.net/download/sxf1061700625/88134425 示例演示: 参考代码: import os import _thread import shutil import subprocess import sys import …...

音频深度学习变得简单:自动语音识别 (ASR),它是如何工作的
一、说明 在过去的几年里,随着Google Home,Amazon Echo,Siri,Cortana等的普及,语音助手已经无处不在。这些是自动语音识别 (ASR) 最著名的示例。此类应用程序从某种语言的语音音频剪辑开始&…...

反射简述
什么是反射反射在java中起到什么样的作用获取class对象的三种方式反射的优缺点图 什么是反射 JAVA反射机制是在运行状态中,对于任意一个类,都能够知道这个类的所有属性和方法;对于任意一个对象,都能够调用它的任意一个方法和属性&…...

Kotlin泛型的协变与逆变
以下内容摘自郭霖《第一行代码》第三版 泛型的协变 一个泛型类或者泛型接口中的方法,它的参数列表是接收数据的地方,因此可以称它为in位置,而它的返回值是输出数据的地方,因此可以称它为out位置。 先定义三个类: op…...

【后端面经】微服务构架 (1-6) | 隔离:如何确保心悦会员体验无忧?唱响隔离的鸣奏曲!
文章目录 一、前置知识1、什么是隔离?2、为什么要隔离?3、怎么进行隔离?A) 机房隔离B) 实例隔离C) 分组隔离D) 连接池隔离 与 线程池隔离E) 信号量隔离F) 第三方依赖隔离二、面试环节1、面试准备2、基本思路3、亮点方案A) 慢任务隔离B) 制作库与线上库分离三、章节总结 …...

复习之kickstart无人职守安装脚本
一、kickstart简介 kickstart是红帽发行版中的一种安装方式,它通过以配置文件的方式来记录linux系统安装的各项参数和想要安装的软件。只要配置正确,整个安装过程中无需人工交互参与,达到无人值守安装的目的。 二、kickstar文件的生成 进入/…...

CSS动画——实现波浪摇摆效果...
一、效果展示 以下主要实现四个动画: 元素上下摇摆动画波浪上下摇摆动画气泡上升及消失动画连续气泡右飘动画 二、实现思路 这里主要讲一下波浪上下摇摆动画和连续气泡右飘动画的实现思路 这里拿一张波浪图来举例解释实现波浪动画的思路: 波浪的摇…...

观成科技:隐蔽隧道工具Ligolo-ng加密流量分析
1.工具介绍 Ligolo-ng是一款由go编写的高效隧道工具,该工具基于TUN接口实现其功能,利用反向TCP/TLS连接建立一条隐蔽的通信信道,支持使用Let’s Encrypt自动生成证书。Ligolo-ng的通信隐蔽性体现在其支持多种连接方式,适应复杂网…...

stm32G473的flash模式是单bank还是双bank?
今天突然有人stm32G473的flash模式是单bank还是双bank?由于时间太久,我真忘记了。搜搜发现,还真有人和我一样。见下面的链接:https://shequ.stmicroelectronics.cn/forum.php?modviewthread&tid644563 根据STM32G4系列参考手…...

基于距离变化能量开销动态调整的WSN低功耗拓扑控制开销算法matlab仿真
目录 1.程序功能描述 2.测试软件版本以及运行结果展示 3.核心程序 4.算法仿真参数 5.算法理论概述 6.参考文献 7.完整程序 1.程序功能描述 通过动态调整节点通信的能量开销,平衡网络负载,延长WSN生命周期。具体通过建立基于距离的能量消耗模型&am…...
vue3 字体颜色设置的多种方式
在Vue 3中设置字体颜色可以通过多种方式实现,这取决于你是想在组件内部直接设置,还是在CSS/SCSS/LESS等样式文件中定义。以下是几种常见的方法: 1. 内联样式 你可以直接在模板中使用style绑定来设置字体颜色。 <template><div :s…...
【git】把本地更改提交远程新分支feature_g
创建并切换新分支 git checkout -b feature_g 添加并提交更改 git add . git commit -m “实现图片上传功能” 推送到远程 git push -u origin feature_g...

CMake 从 GitHub 下载第三方库并使用
有时我们希望直接使用 GitHub 上的开源库,而不想手动下载、编译和安装。 可以利用 CMake 提供的 FetchContent 模块来实现自动下载、构建和链接第三方库。 FetchContent 命令官方文档✅ 示例代码 我们将以 fmt 这个流行的格式化库为例,演示如何: 使用 FetchContent 从 GitH…...
【HarmonyOS 5 开发速记】如何获取用户信息(头像/昵称/手机号)
1.获取 authorizationCode: 2.利用 authorizationCode 获取 accessToken:文档中心 3.获取手机:文档中心 4.获取昵称头像:文档中心 首先创建 request 若要获取手机号,scope必填 phone,permissions 必填 …...
管理学院权限管理系统开发总结
文章目录 🎓 管理学院权限管理系统开发总结 - 现代化Web应用实践之路📝 项目概述🏗️ 技术架构设计后端技术栈前端技术栈 💡 核心功能特性1. 用户管理模块2. 权限管理系统3. 统计报表功能4. 用户体验优化 🗄️ 数据库设…...

HDFS分布式存储 zookeeper
hadoop介绍 狭义上hadoop是指apache的一款开源软件 用java语言实现开源框架,允许使用简单的变成模型跨计算机对大型集群进行分布式处理(1.海量的数据存储 2.海量数据的计算)Hadoop核心组件 hdfs(分布式文件存储系统)&a…...
C++.OpenGL (14/64)多光源(Multiple Lights)
多光源(Multiple Lights) 多光源渲染技术概览 #mermaid-svg-3L5e5gGn76TNh7Lq {font-family:"trebuchet ms",verdana,arial,sans-serif;font-size:16px;fill:#333;}#mermaid-svg-3L5e5gGn76TNh7Lq .error-icon{fill:#552222;}#mermaid-svg-3L5e5gGn76TNh7Lq .erro…...